From 7933771fdfd8590c892935b23e2bf3816100db36 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 2 Oct 2024 22:13:46 +0100 Subject: [PATCH 001/137] wip --- tools/vendor_datasets.py | 660 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 660 insertions(+) create mode 100644 tools/vendor_datasets.py diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py new file mode 100644 index 000000000..003e55062 --- /dev/null +++ b/tools/vendor_datasets.py @@ -0,0 +1,660 @@ +from __future__ import annotations + +import json +import pkgutil +import sys +import textwrap +from functools import partial +from io import BytesIO +from pathlib import Path +from typing import Any, Iterable, Literal, cast +from urllib.request import urlopen + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +import pandas as pd +import polars as pl + +# This is the tag in http://github.com/vega/vega-datasets from +# which the datasets in this repository are sourced. +SOURCE_TAG = "v1.29.0" # 5 years ago +CURRENT_TAG = "v2.9.0" +USE_TAG = CURRENT_TAG + +BASE_URL = f"https://cdn.jsdelivr.net/npm/vega-datasets@{USE_TAG}/data/" + +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] + + +def _load_dataset_info() -> dict[str, dict[str, Any]]: + """ + Loads dataset info from three package files. + + vega_datasets/datasets.json + vega_datasets/dataset_info.json + vega_datasets/local_datasets.json + + It returns a dictionary with dataset information. + """ + + def load_json(path: str) -> dict[str, Any]: + raw = pkgutil.get_data("vega_datasets", path) + if raw is None: + msg = f"Cannot locate package path vega_datasets:{path}" + raise ValueError(msg) + return json.loads(raw.decode()) + + info = load_json("datasets.json") + descriptions = load_json("dataset_info.json") + local_datasets = load_json("local_datasets.json") + + for name in info: + info[name]["is_local"] = name in local_datasets + for name in descriptions: + info[name].update(descriptions[name]) + + return info + + +class Dataset: + """Class to load a particular dataset by name.""" + + _instance_doc = """Loader for the {name} dataset. + + {data_description} + + {bundle_info} + Dataset source: {url} + + Usage + ----- + + >>> from vega_datasets import data + >>> {methodname} = data.{methodname}() + >>> type({methodname}) + {return_type} + + Equivalently, you can use + + >>> {methodname} = data('{name}') + + To get the raw dataset rather than the dataframe, use + + >>> data_bytes = data.{methodname}.raw() + >>> type(data_bytes) + bytes + + To find the dataset url, use + + >>> data.{methodname}.url + '{url}' + {additional_docs} + Attributes + ---------- + filename : string + The filename in which the dataset is stored + url : string + The full URL of the dataset at http://vega.github.io + format : string + The format of the dataset: usually one of {{'csv', 'tsv', 'json'}} + pkg_filename : string + The path to the local dataset within the vega_datasets package + is_local : bool + True if the dataset is available locally in the package + filepath : string + If is_local is True, the local file path to the dataset. 
+ + {reference_info} + """ + _additional_docs = "" + _reference_info = """ + For information on this dataset, see https://github.com/vega/vega-datasets/ + """ + base_url = "https://cdn.jsdelivr.net/npm/vega-datasets@" + SOURCE_TAG + "/data/" + _dataset_info = _load_dataset_info() + _pd_read_kwds: dict[str, Any] = {} + _return_type = pd.DataFrame + name: str + + @classmethod + def init(cls, name: str) -> Dataset: + """Return an instance of this class or an appropriate subclass.""" + clsdict = { + subcls.name: subcls + for subcls in cls.__subclasses__() + if hasattr(subcls, "name") + } + return clsdict.get(name, cls)(name) + + def __init__(self, name: str): + info = self._infodict(name) + self.name = name + self.methodname = name.replace("-", "_") + self.filename = info["filename"] + self.url = self.base_url + info["filename"] + self.format = info["format"] + self.pkg_filename = "_data/" + self.filename + self.is_local = info["is_local"] + self.description = info.get("description", None) + self.references = info.get("references", None) + self.__doc__ = self._make_docstring() + + @classmethod + def list_datasets(cls) -> list[str]: + """Return a list of names of available datasets.""" + return sorted(cls._dataset_info.keys()) + + @classmethod + def list_local_datasets(cls) -> list[str]: + return sorted( + name for name, info in cls._dataset_info.items() if info["is_local"] + ) + + @classmethod + def _infodict(cls, name: str) -> dict[str, str]: + """Load the info dictionary for the given name.""" + info = cls._dataset_info.get(name, None) + if info is None: + msg = ( + f"No such dataset {name} exists, " + "use list_datasets() to get a list " + "of available datasets." + ) + raise ValueError(msg) + return info + + def raw(self, use_local: bool = True) -> bytes: + """Load the raw dataset from remote URL or local file.""" + if use_local and self.is_local: + out = pkgutil.get_data("vega_datasets", self.pkg_filename) + if out is not None: + return out + msg = f"Cannot locate package path vega_datasets:{self.pkg_filename}" + raise ValueError(msg) + else: + return urlopen(self.url).read() + + def __call__(self, use_local: bool = True, **kwargs) -> pd.DataFrame: + """Load and parse the dataset from remote URL or local file.""" + datasource = BytesIO(self.raw(use_local=use_local)) + + kwds = self._pd_read_kwds.copy() + kwds.update(kwargs) + + if self.format == "json": + return pd.read_json(datasource, **kwds) + elif self.format == "csv": + return pd.read_csv(datasource, **kwds) + elif self.format == "tsv": + kwds.setdefault("sep", "\t") + return pd.read_csv(datasource, **kwds) + else: + msg = ( + f"Unrecognized file format: {self.format}. " + "Valid options are ['json', 'csv', 'tsv']." + ) + raise ValueError(msg) + + @property + def filepath(self) -> str: + if not self.is_local: + msg = "filepath is only valid for local datasets" + raise ValueError(msg) + else: + return str((Path(__file__).parent / "_data" / self.filename).resolve()) + + def _make_docstring(self) -> str: + info = self._infodict(self.name) + + # construct, indent, and line-wrap dataset description + description = info.get("description", "") + if not description: + description = ( + "This dataset is described at " "https://github.com/vega/vega-datasets/" + ) + wrapper = textwrap.TextWrapper( + width=70, initial_indent="", subsequent_indent=4 * " " + ) + description = "\n".join(wrapper.wrap(description)) + + # construct, indent, and join references + reflist: Iterable[str] = info.get("references", []) + reflist = (f".. 
[{i + 1}] " + ref for i, ref in enumerate(reflist)) + wrapper = textwrap.TextWrapper( + width=70, initial_indent=4 * " ", subsequent_indent=7 * " " + ) + reflist = ("\n".join(wrapper.wrap(ref)) for ref in reflist) + references: str = "\n\n".join(reflist) + if references.strip(): + references = "References\n ----------\n" + references + + # add information about bundling of data + if self.is_local: + bundle_info = ( + "This dataset is bundled with vega_datasets; " + "it can be loaded without web access." + ) + else: + bundle_info = ( + "This dataset is not bundled with vega_datasets; " + "it requires web access to load." + ) + + return self._instance_doc.format( + additional_docs=self._additional_docs, + data_description=description, + reference_info=references, + bundle_info=bundle_info, + return_type=self._return_type, + **self.__dict__, + ) + + +def getattr_to_df(name: str, /) -> pl.DataFrame: + """Subset of what `Dataset` does.""" + js_name = name.replace("_", "-") + file_name = DATASETS_JSON[js_name]["filename"] + suffix = Path(file_name).suffix + if suffix in {".csv", ".json", ".tsv"}: + extension = cast(ExtSupported, suffix) + else: + raise NotImplementedError(suffix, file_name) + + url = f"{BASE_URL}{file_name}" + with urlopen(url) as f: + content = ext_fn(extension)(f) + return content + + +class DSet: + def __init__(self, name: str, /) -> None: + self.name: str = name + js_name = name.replace("_", "-") + file_name = DATASETS_JSON[js_name]["filename"] + suffix = Path(file_name).suffix + self.extension: ExtSupported + if suffix in {".csv", ".json", ".tsv"}: + self.extension = cast(ExtSupported, suffix) + else: + raise NotImplementedError(suffix, file_name) + + self.url: str = f"{BASE_URL}{file_name}" + + def __call__(self, **kwds: Any) -> pl.DataFrame: + with urlopen(self.url) as f: + content = ext_fn(self.extension, **kwds)(f) + return content + + def __repr__(self) -> str: + return ( + f"{type(self).__name__}(\n " + f"name={self.name!r},\n " + f"url={self.url!r}\n" + ")" + ) + + +def ext_fn(ext: ExtSupported, /): + """Very basic mapping to `polars` eager functions.""" + if ext == ".csv": + return pl.read_csv + elif ext == ".json": + return pl.read_json + elif ext == ".tsv": + return partial(pl.read_csv, separator="\t") + else: + raise + + +DATASET_NAMES_USED = [ + "airports", + "anscombe", + "barley", + "cars", + "co2_concentration", + "countries", + "disasters", + "driving", + "earthquakes", + "flights_2k", + "flights_5k", + "flights_airport", + "gapminder_health_income", + "github", + "income", + "iowa_electricity", + "iris", + "jobs", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "monarchs", + "movies", + "normal_2d", + "ohlc", + "population", + "population_engineers_hurricanes", + "seattle_weather", + "sp500", + "stocks", + "unemployment", + "unemployment_across_industries", + "us_10m", + "us_employment", + "us_state_capitals", + "us_unemployment", + "wheat", + "windvectors", + "world_110m", + "zipcodes", +] + +DATASETS_JSON = { + # "7zip": {"filename": "7zip.png", "format": "png"}, + "airports": {"filename": "airports.csv", "format": "csv"}, + "annual-precip": {"filename": "annual-precip.json", "format": "json"}, + "anscombe": {"filename": "anscombe.json", "format": "json"}, + "barley": {"filename": "barley.json", "format": "json"}, + "birdstrikes": {"filename": "birdstrikes.json", "format": "json"}, + "budget": {"filename": "budget.json", "format": "json"}, + "budgets": {"filename": "budgets.json", "format": "json"}, + "burtin": {"filename": "burtin.json", 
"format": "json"}, + "cars": {"filename": "cars.json", "format": "json"}, + "climate": {"filename": "climate.json", "format": "json"}, + "co2-concentration": {"filename": "co2-concentration.csv", "format": "csv"}, + "countries": {"filename": "countries.json", "format": "json"}, + "crimea": {"filename": "crimea.json", "format": "json"}, + "disasters": {"filename": "disasters.csv", "format": "csv"}, + "driving": {"filename": "driving.json", "format": "json"}, + "earthquakes": {"filename": "earthquakes.json", "format": "json"}, + # "ffox": {"filename": "ffox.png", "format": "png"}, + "flare": {"filename": "flare.json", "format": "json"}, + "flare-dependencies": {"filename": "flare-dependencies.json", "format": "json"}, + "flights-10k": {"filename": "flights-10k.json", "format": "json"}, + "flights-200k": {"filename": "flights-200k.json", "format": "json"}, + "flights-20k": {"filename": "flights-20k.json", "format": "json"}, + "flights-2k": {"filename": "flights-2k.json", "format": "json"}, + "flights-3m": {"filename": "flights-3m.csv", "format": "csv"}, + "flights-5k": {"filename": "flights-5k.json", "format": "json"}, + "flights-airport": {"filename": "flights-airport.csv", "format": "csv"}, + "gapminder": {"filename": "gapminder.json", "format": "json"}, + "gapminder-health-income": { + "filename": "gapminder-health-income.csv", + "format": "csv", + }, + # "gimp": {"filename": "gimp.png", "format": "png"}, + "github": {"filename": "github.csv", "format": "csv"}, + "graticule": {"filename": "graticule.json", "format": "json"}, + "income": {"filename": "income.json", "format": "json"}, + "iowa-electricity": {"filename": "iowa-electricity.csv", "format": "csv"}, + "iris": {"filename": "iris.json", "format": "json"}, + "jobs": {"filename": "jobs.json", "format": "json"}, + "la-riots": {"filename": "la-riots.csv", "format": "csv"}, + "londonBoroughs": {"filename": "londonBoroughs.json", "format": "json"}, + "londonCentroids": {"filename": "londonCentroids.json", "format": "json"}, + "londonTubeLines": {"filename": "londonTubeLines.json", "format": "json"}, + "lookup_groups": {"filename": "lookup_groups.csv", "format": "csv"}, + "lookup_people": {"filename": "lookup_people.csv", "format": "csv"}, + "miserables": {"filename": "miserables.json", "format": "json"}, + "monarchs": {"filename": "monarchs.json", "format": "json"}, + "movies": {"filename": "movies.json", "format": "json"}, + "normal-2d": {"filename": "normal-2d.json", "format": "json"}, + "obesity": {"filename": "obesity.json", "format": "json"}, + "ohlc": {"filename": "ohlc.json", "format": "json"}, + "points": {"filename": "points.json", "format": "json"}, + "population": {"filename": "population.json", "format": "json"}, + "population_engineers_hurricanes": { + "filename": "population_engineers_hurricanes.csv", + "format": "csv", + }, + "seattle-temps": {"filename": "seattle-temps.csv", "format": "csv"}, + "seattle-weather": {"filename": "seattle-weather.csv", "format": "csv"}, + "sf-temps": {"filename": "sf-temps.csv", "format": "csv"}, + "sp500": {"filename": "sp500.csv", "format": "csv"}, + "stocks": {"filename": "stocks.csv", "format": "csv"}, + "udistrict": {"filename": "udistrict.json", "format": "json"}, + "unemployment": {"filename": "unemployment.tsv", "format": "tsv"}, + "unemployment-across-industries": { + "filename": "unemployment-across-industries.json", + "format": "json", + }, + "uniform-2d": {"filename": "uniform-2d.json", "format": "json"}, + "us-10m": {"filename": "us-10m.json", "format": "json"}, + "us-employment": 
{"filename": "us-employment.csv", "format": "csv"}, + "us-state-capitals": {"filename": "us-state-capitals.json", "format": "json"}, + "volcano": {"filename": "volcano.json", "format": "json"}, + "weather": {"filename": "weather.json", "format": "json"}, + "weball26": {"filename": "weball26.json", "format": "json"}, + "wheat": {"filename": "wheat.json", "format": "json"}, + "windvectors": {"filename": "windvectors.csv", "format": "csv"}, + "world-110m": {"filename": "world-110m.json", "format": "json"}, + "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, +} + + +class Stocks(Dataset): + name = "stocks" + _additional_docs = """ + For convenience, the stocks dataset supports pivoted output using the + optional `pivoted` keyword. If pivoted is set to True, each company's + price history will be returned in a separate column: + + >>> df = data.stocks() # not pivoted + >>> df.head(3) + symbol date price + 0 MSFT 2000-01-01 39.81 + 1 MSFT 2000-02-01 36.35 + 2 MSFT 2000-03-01 43.22 + + >>> df_pivoted = data.stocks(pivoted=True) + >>> df_pivoted.head() + symbol AAPL AMZN GOOG IBM MSFT + date + 2000-01-01 25.94 64.56 NaN 100.52 39.81 + 2000-02-01 28.66 68.87 NaN 92.11 36.35 + 2000-03-01 33.95 67.00 NaN 106.11 43.22 + """ + _pd_read_kwds = {"parse_dates": ["date"]} + + def __call__(self, pivoted=False, use_local=True, **kwargs): + """ + Load and parse the dataset from remote URL or local file. + + Parameters + ---------- + pivoted : boolean, default False + If True, then pivot data so that each stock is in its own column. + use_local : boolean + If True (default), then attempt to load the dataset locally. If + False or if the dataset is not available locally, then load the + data from an external URL. + **kwargs : + additional keyword arguments are passed to data parser (usually + pd.read_csv or pd.read_json, depending on the format of the data + source) + + Returns + ------- + data : DataFrame + parsed data + """ + __doc__ = super().__call__.__doc__ # noqa:F841 + data = super().__call__(use_local=use_local, **kwargs) + if pivoted: + data = data.pivot(index="date", columns="symbol", values="price") + return data + + +class Cars(Dataset): + name = "cars" + _pd_read_kwds = {"convert_dates": ["Year"]} + + +class Climate(Dataset): + name = "climate" + _pd_read_kwds = {"convert_dates": ["DATE"]} + + +class Github(Dataset): + name = "github" + _pd_read_kwds = {"parse_dates": ["time"]} + + +class IowaElectricity(Dataset): + name = "iowa-electricity" + _pd_read_kwds = {"parse_dates": ["year"]} + + +class LARiots(Dataset): + name = "la-riots" + _pd_read_kwds = {"parse_dates": ["death_date"]} + + +class Miserables(Dataset): + name = "miserables" + _return_type = tuple + _additional_docs = """ + The miserables data contains two dataframes, ``nodes`` and ``links``, + both of which are returned from this function. 
+ """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + dct = json.loads(self.raw(use_local=use_local).decode(), **kwargs) + nodes = pd.DataFrame.from_records(dct["nodes"], index="index") + links = pd.DataFrame.from_records(dct["links"]) + return nodes, links + + +class SeattleTemps(Dataset): + name = "seattle-temps" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class SeattleWeather(Dataset): + name = "seattle-weather" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class SFTemps(Dataset): + name = "sf-temps" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class Sp500(Dataset): + name = "sp500" + _pd_read_kwds = {"parse_dates": ["date"]} + + +class UnemploymentAcrossIndustries(Dataset): + name = "unemployment-across-industries" + _pd_read_kwds = {"convert_dates": ["date"]} + + +class US_10M(Dataset): + name = "us-10m" + _return_type = dict + _additional_docs = """ + The us-10m dataset is a TopoJSON file, with a structure that is not + suitable for storage in a dataframe. For this reason, the loader returns + a simple Python dictionary. + """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + return json.loads(self.raw(use_local=use_local).decode(), **kwargs) + + +class World_110M(Dataset): + name = "world-110m" + _return_type = dict + _additional_docs = """ + The world-100m dataset is a TopoJSON file, with a structure that is not + suitable for storage in a dataframe. For this reason, the loader returns + a simple Python dictionary. + """ + + def __call__(self, use_local=True, **kwargs): + __doc__ = super().__call__.__doc__ # noqa:F841 + return json.loads(self.raw(use_local=use_local).decode(), **kwargs) + + +class ZIPCodes(Dataset): + name = "zipcodes" + _pd_read_kwds = {"dtype": {"zip_code": "object"}} + + +class DataLoader: + """ + Load a dataset from a local file or remote URL. + + There are two ways to call this; for example to load the iris dataset, you + can call this object and pass the dataset name by string: + + >>> from vega_datasets import data + >>> df = data("iris") + + or you can call the associated named method: + + >>> df = data.iris() + + Optionally, additional parameters can be passed to either of these + + Optional parameters + ------------------- + return_raw : boolean + If True, then return the raw string or bytes. + If False (default), then return a pandas dataframe. + use_local : boolean + If True (default), then attempt to load the dataset locally. If + False or if the dataset is not available locally, then load the + data from an external URL. + **kwargs : + additional keyword arguments are passed to the pandas parsing function, + either ``read_csv()`` or ``read_json()`` depending on the data format. 
+ """ + + _datasets = {name.replace("-", "_"): name for name in Dataset.list_datasets()} + + def list_datasets(self): + return Dataset.list_datasets() + + def __call__(self, name, return_raw=False, use_local=True, **kwargs): + loader = getattr(self, name.replace("-", "_")) + if return_raw: + return loader.raw(use_local=use_local, **kwargs) + else: + return loader(use_local=use_local, **kwargs) + + def __getattr__(self, dataset_name): + if dataset_name in self._datasets: + return Dataset.init(self._datasets[dataset_name]) + else: + msg = f"No dataset named '{dataset_name}'" + raise AttributeError(msg) + + def __dir__(self): + return list(self._datasets.keys()) + + +class LocalDataLoader(DataLoader): + _datasets = {name.replace("-", "_"): name for name in Dataset.list_local_datasets()} + + def list_datasets(self): + return Dataset.list_local_datasets() + + def __getattr__(self, dataset_name): + if dataset_name in self._datasets: + return Dataset.init(self._datasets[dataset_name]) + elif dataset_name in DataLoader._datasets: + msg = ( + f"'{dataset_name}' dataset is not available locally. To " + f"download it, use ``vega_datasets.data.{dataset_name}()" + ) + raise ValueError(msg) + else: + msg = f"No dataset named '{dataset_name}'" + raise AttributeError(msg) From b30081e9de975bed60247c65b477012d68b4e132 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 18:33:06 +0100 Subject: [PATCH 002/137] feat(DRAFT): Minimal reimplementation --- tools/vendor_datasets.py | 478 ++------------------------------------- 1 file changed, 17 insertions(+), 461 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 003e55062..4a435c253 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -1,13 +1,9 @@ from __future__ import annotations -import json -import pkgutil import sys -import textwrap -from functools import partial -from io import BytesIO +from functools import cached_property, partial from pathlib import Path -from typing import Any, Iterable, Literal, cast +from typing import Any, Literal, cast from urllib.request import urlopen if sys.version_info >= (3, 10): @@ -15,7 +11,6 @@ else: from typing_extensions import TypeAlias -import pandas as pd import polars as pl # This is the tag in http://github.com/vega/vega-datasets from @@ -29,247 +24,7 @@ ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] -def _load_dataset_info() -> dict[str, dict[str, Any]]: - """ - Loads dataset info from three package files. - - vega_datasets/datasets.json - vega_datasets/dataset_info.json - vega_datasets/local_datasets.json - - It returns a dictionary with dataset information. - """ - - def load_json(path: str) -> dict[str, Any]: - raw = pkgutil.get_data("vega_datasets", path) - if raw is None: - msg = f"Cannot locate package path vega_datasets:{path}" - raise ValueError(msg) - return json.loads(raw.decode()) - - info = load_json("datasets.json") - descriptions = load_json("dataset_info.json") - local_datasets = load_json("local_datasets.json") - - for name in info: - info[name]["is_local"] = name in local_datasets - for name in descriptions: - info[name].update(descriptions[name]) - - return info - - class Dataset: - """Class to load a particular dataset by name.""" - - _instance_doc = """Loader for the {name} dataset. 
- - {data_description} - - {bundle_info} - Dataset source: {url} - - Usage - ----- - - >>> from vega_datasets import data - >>> {methodname} = data.{methodname}() - >>> type({methodname}) - {return_type} - - Equivalently, you can use - - >>> {methodname} = data('{name}') - - To get the raw dataset rather than the dataframe, use - - >>> data_bytes = data.{methodname}.raw() - >>> type(data_bytes) - bytes - - To find the dataset url, use - - >>> data.{methodname}.url - '{url}' - {additional_docs} - Attributes - ---------- - filename : string - The filename in which the dataset is stored - url : string - The full URL of the dataset at http://vega.github.io - format : string - The format of the dataset: usually one of {{'csv', 'tsv', 'json'}} - pkg_filename : string - The path to the local dataset within the vega_datasets package - is_local : bool - True if the dataset is available locally in the package - filepath : string - If is_local is True, the local file path to the dataset. - - {reference_info} - """ - _additional_docs = "" - _reference_info = """ - For information on this dataset, see https://github.com/vega/vega-datasets/ - """ - base_url = "https://cdn.jsdelivr.net/npm/vega-datasets@" + SOURCE_TAG + "/data/" - _dataset_info = _load_dataset_info() - _pd_read_kwds: dict[str, Any] = {} - _return_type = pd.DataFrame - name: str - - @classmethod - def init(cls, name: str) -> Dataset: - """Return an instance of this class or an appropriate subclass.""" - clsdict = { - subcls.name: subcls - for subcls in cls.__subclasses__() - if hasattr(subcls, "name") - } - return clsdict.get(name, cls)(name) - - def __init__(self, name: str): - info = self._infodict(name) - self.name = name - self.methodname = name.replace("-", "_") - self.filename = info["filename"] - self.url = self.base_url + info["filename"] - self.format = info["format"] - self.pkg_filename = "_data/" + self.filename - self.is_local = info["is_local"] - self.description = info.get("description", None) - self.references = info.get("references", None) - self.__doc__ = self._make_docstring() - - @classmethod - def list_datasets(cls) -> list[str]: - """Return a list of names of available datasets.""" - return sorted(cls._dataset_info.keys()) - - @classmethod - def list_local_datasets(cls) -> list[str]: - return sorted( - name for name, info in cls._dataset_info.items() if info["is_local"] - ) - - @classmethod - def _infodict(cls, name: str) -> dict[str, str]: - """Load the info dictionary for the given name.""" - info = cls._dataset_info.get(name, None) - if info is None: - msg = ( - f"No such dataset {name} exists, " - "use list_datasets() to get a list " - "of available datasets." 
- ) - raise ValueError(msg) - return info - - def raw(self, use_local: bool = True) -> bytes: - """Load the raw dataset from remote URL or local file.""" - if use_local and self.is_local: - out = pkgutil.get_data("vega_datasets", self.pkg_filename) - if out is not None: - return out - msg = f"Cannot locate package path vega_datasets:{self.pkg_filename}" - raise ValueError(msg) - else: - return urlopen(self.url).read() - - def __call__(self, use_local: bool = True, **kwargs) -> pd.DataFrame: - """Load and parse the dataset from remote URL or local file.""" - datasource = BytesIO(self.raw(use_local=use_local)) - - kwds = self._pd_read_kwds.copy() - kwds.update(kwargs) - - if self.format == "json": - return pd.read_json(datasource, **kwds) - elif self.format == "csv": - return pd.read_csv(datasource, **kwds) - elif self.format == "tsv": - kwds.setdefault("sep", "\t") - return pd.read_csv(datasource, **kwds) - else: - msg = ( - f"Unrecognized file format: {self.format}. " - "Valid options are ['json', 'csv', 'tsv']." - ) - raise ValueError(msg) - - @property - def filepath(self) -> str: - if not self.is_local: - msg = "filepath is only valid for local datasets" - raise ValueError(msg) - else: - return str((Path(__file__).parent / "_data" / self.filename).resolve()) - - def _make_docstring(self) -> str: - info = self._infodict(self.name) - - # construct, indent, and line-wrap dataset description - description = info.get("description", "") - if not description: - description = ( - "This dataset is described at " "https://github.com/vega/vega-datasets/" - ) - wrapper = textwrap.TextWrapper( - width=70, initial_indent="", subsequent_indent=4 * " " - ) - description = "\n".join(wrapper.wrap(description)) - - # construct, indent, and join references - reflist: Iterable[str] = info.get("references", []) - reflist = (f".. [{i + 1}] " + ref for i, ref in enumerate(reflist)) - wrapper = textwrap.TextWrapper( - width=70, initial_indent=4 * " ", subsequent_indent=7 * " " - ) - reflist = ("\n".join(wrapper.wrap(ref)) for ref in reflist) - references: str = "\n\n".join(reflist) - if references.strip(): - references = "References\n ----------\n" + references - - # add information about bundling of data - if self.is_local: - bundle_info = ( - "This dataset is bundled with vega_datasets; " - "it can be loaded without web access." - ) - else: - bundle_info = ( - "This dataset is not bundled with vega_datasets; " - "it requires web access to load." - ) - - return self._instance_doc.format( - additional_docs=self._additional_docs, - data_description=description, - reference_info=references, - bundle_info=bundle_info, - return_type=self._return_type, - **self.__dict__, - ) - - -def getattr_to_df(name: str, /) -> pl.DataFrame: - """Subset of what `Dataset` does.""" - js_name = name.replace("_", "-") - file_name = DATASETS_JSON[js_name]["filename"] - suffix = Path(file_name).suffix - if suffix in {".csv", ".json", ".tsv"}: - extension = cast(ExtSupported, suffix) - else: - raise NotImplementedError(suffix, file_name) - - url = f"{BASE_URL}{file_name}" - with urlopen(url) as f: - content = ext_fn(extension)(f) - return content - - -class DSet: def __init__(self, name: str, /) -> None: self.name: str = name js_name = name.replace("_", "-") @@ -435,226 +190,27 @@ def ext_fn(ext: ExtSupported, /): } -class Stocks(Dataset): - name = "stocks" - _additional_docs = """ - For convenience, the stocks dataset supports pivoted output using the - optional `pivoted` keyword. 
If pivoted is set to True, each company's - price history will be returned in a separate column: - - >>> df = data.stocks() # not pivoted - >>> df.head(3) - symbol date price - 0 MSFT 2000-01-01 39.81 - 1 MSFT 2000-02-01 36.35 - 2 MSFT 2000-03-01 43.22 - - >>> df_pivoted = data.stocks(pivoted=True) - >>> df_pivoted.head() - symbol AAPL AMZN GOOG IBM MSFT - date - 2000-01-01 25.94 64.56 NaN 100.52 39.81 - 2000-02-01 28.66 68.87 NaN 92.11 36.35 - 2000-03-01 33.95 67.00 NaN 106.11 43.22 - """ - _pd_read_kwds = {"parse_dates": ["date"]} - - def __call__(self, pivoted=False, use_local=True, **kwargs): - """ - Load and parse the dataset from remote URL or local file. - - Parameters - ---------- - pivoted : boolean, default False - If True, then pivot data so that each stock is in its own column. - use_local : boolean - If True (default), then attempt to load the dataset locally. If - False or if the dataset is not available locally, then load the - data from an external URL. - **kwargs : - additional keyword arguments are passed to data parser (usually - pd.read_csv or pd.read_json, depending on the format of the data - source) - - Returns - ------- - data : DataFrame - parsed data - """ - __doc__ = super().__call__.__doc__ # noqa:F841 - data = super().__call__(use_local=use_local, **kwargs) - if pivoted: - data = data.pivot(index="date", columns="symbol", values="price") - return data - - -class Cars(Dataset): - name = "cars" - _pd_read_kwds = {"convert_dates": ["Year"]} - - -class Climate(Dataset): - name = "climate" - _pd_read_kwds = {"convert_dates": ["DATE"]} - - -class Github(Dataset): - name = "github" - _pd_read_kwds = {"parse_dates": ["time"]} - - -class IowaElectricity(Dataset): - name = "iowa-electricity" - _pd_read_kwds = {"parse_dates": ["year"]} - - -class LARiots(Dataset): - name = "la-riots" - _pd_read_kwds = {"parse_dates": ["death_date"]} - - -class Miserables(Dataset): - name = "miserables" - _return_type = tuple - _additional_docs = """ - The miserables data contains two dataframes, ``nodes`` and ``links``, - both of which are returned from this function. - """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - dct = json.loads(self.raw(use_local=use_local).decode(), **kwargs) - nodes = pd.DataFrame.from_records(dct["nodes"], index="index") - links = pd.DataFrame.from_records(dct["links"]) - return nodes, links - - -class SeattleTemps(Dataset): - name = "seattle-temps" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class SeattleWeather(Dataset): - name = "seattle-weather" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class SFTemps(Dataset): - name = "sf-temps" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class Sp500(Dataset): - name = "sp500" - _pd_read_kwds = {"parse_dates": ["date"]} - - -class UnemploymentAcrossIndustries(Dataset): - name = "unemployment-across-industries" - _pd_read_kwds = {"convert_dates": ["date"]} - - -class US_10M(Dataset): - name = "us-10m" - _return_type = dict - _additional_docs = """ - The us-10m dataset is a TopoJSON file, with a structure that is not - suitable for storage in a dataframe. For this reason, the loader returns - a simple Python dictionary. 
- """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - return json.loads(self.raw(use_local=use_local).decode(), **kwargs) - - -class World_110M(Dataset): - name = "world-110m" - _return_type = dict - _additional_docs = """ - The world-100m dataset is a TopoJSON file, with a structure that is not - suitable for storage in a dataframe. For this reason, the loader returns - a simple Python dictionary. - """ - - def __call__(self, use_local=True, **kwargs): - __doc__ = super().__call__.__doc__ # noqa:F841 - return json.loads(self.raw(use_local=use_local).decode(), **kwargs) - - -class ZIPCodes(Dataset): - name = "zipcodes" - _pd_read_kwds = {"dtype": {"zip_code": "object"}} - - class DataLoader: - """ - Load a dataset from a local file or remote URL. + @cached_property + def _dataset_names(self) -> list[str]: + return sorted(DATASETS_JSON) - There are two ways to call this; for example to load the iris dataset, you - can call this object and pass the dataset name by string: + @cached_property + def _py_js_names(self) -> dict[str, str]: + return {name.replace("-", "_"): name for name in self._dataset_names} - >>> from vega_datasets import data - >>> df = data("iris") + def list_datasets(self) -> list[str]: + return list(self._py_js_names) - or you can call the associated named method: - - >>> df = data.iris() - - Optionally, additional parameters can be passed to either of these - - Optional parameters - ------------------- - return_raw : boolean - If True, then return the raw string or bytes. - If False (default), then return a pandas dataframe. - use_local : boolean - If True (default), then attempt to load the dataset locally. If - False or if the dataset is not available locally, then load the - data from an external URL. - **kwargs : - additional keyword arguments are passed to the pandas parsing function, - either ``read_csv()`` or ``read_json()`` depending on the data format. - """ - - _datasets = {name.replace("-", "_"): name for name in Dataset.list_datasets()} - - def list_datasets(self): - return Dataset.list_datasets() - - def __call__(self, name, return_raw=False, use_local=True, **kwargs): - loader = getattr(self, name.replace("-", "_")) - if return_raw: - return loader.raw(use_local=use_local, **kwargs) + def __getattr__(self, name: str) -> Dataset: + if name in self._py_js_names: + return Dataset(self._py_js_names[name]) else: - return loader(use_local=use_local, **kwargs) - - def __getattr__(self, dataset_name): - if dataset_name in self._datasets: - return Dataset.init(self._datasets[dataset_name]) - else: - msg = f"No dataset named '{dataset_name}'" + msg = f"No dataset named '{name}'" raise AttributeError(msg) - def __dir__(self): - return list(self._datasets.keys()) - + def __dir__(self) -> list[str]: + return self.list_datasets() -class LocalDataLoader(DataLoader): - _datasets = {name.replace("-", "_"): name for name in Dataset.list_local_datasets()} - def list_datasets(self): - return Dataset.list_local_datasets() - - def __getattr__(self, dataset_name): - if dataset_name in self._datasets: - return Dataset.init(self._datasets[dataset_name]) - elif dataset_name in DataLoader._datasets: - msg = ( - f"'{dataset_name}' dataset is not available locally. 
To " - f"download it, use ``vega_datasets.data.{dataset_name}()" - ) - raise ValueError(msg) - else: - msg = f"No dataset named '{dataset_name}'" - raise AttributeError(msg) +data = DataLoader() From 279586b17dc766382b7a06e5874983e704789bf9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:26:09 +0100 Subject: [PATCH 003/137] refactor: Make version accessible via `data.source_tag` - Allow quickly switching between version tags https://github.com/vega/altair/discussions/3150#discussioncomment-6719752 --- tools/vendor_datasets.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 4a435c253..a50297420 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -3,7 +3,7 @@ import sys from functools import cached_property, partial from pathlib import Path -from typing import Any, Literal, cast +from typing import Any, ClassVar, Literal, cast from urllib.request import urlopen if sys.version_info >= (3, 10): @@ -15,20 +15,25 @@ # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. -SOURCE_TAG = "v1.29.0" # 5 years ago -CURRENT_TAG = "v2.9.0" -USE_TAG = CURRENT_TAG +_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago +_CURRENT_SOURCE_TAG = "v2.9.0" + + +def _py_to_js(s: str, /): + return s.replace("_", "-") + + +def _js_to_py(s: str, /): + return s.replace("-", "_") -BASE_URL = f"https://cdn.jsdelivr.net/npm/vega-datasets@{USE_TAG}/data/" ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] class Dataset: - def __init__(self, name: str, /) -> None: + def __init__(self, name: str, /, base_url: str) -> None: self.name: str = name - js_name = name.replace("_", "-") - file_name = DATASETS_JSON[js_name]["filename"] + file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix self.extension: ExtSupported if suffix in {".csv", ".json", ".tsv"}: @@ -36,7 +41,7 @@ def __init__(self, name: str, /) -> None: else: raise NotImplementedError(suffix, file_name) - self.url: str = f"{BASE_URL}{file_name}" + self.url: str = f"{base_url}{file_name}" def __call__(self, **kwds: Any) -> pl.DataFrame: with urlopen(self.url) as f: @@ -191,22 +196,29 @@ def ext_fn(ext: ExtSupported, /): class DataLoader: + source_tag: ClassVar[str] = "v2.9.0" + _base_url_fmt: str = "https://cdn.jsdelivr.net/npm/vega-datasets@{0}/data/" + + @property + def base_url(self) -> str: + return self._base_url_fmt.format(self.source_tag) + @cached_property def _dataset_names(self) -> list[str]: return sorted(DATASETS_JSON) @cached_property def _py_js_names(self) -> dict[str, str]: - return {name.replace("-", "_"): name for name in self._dataset_names} + return {_js_to_py(name): name for name in self._dataset_names} def list_datasets(self) -> list[str]: return list(self._py_js_names) def __getattr__(self, name: str) -> Dataset: if name in self._py_js_names: - return Dataset(self._py_js_names[name]) + return Dataset(self._py_js_names[name], self.base_url) else: - msg = f"No dataset named '{name}'" + msg = f"No dataset named {name!r}" raise AttributeError(msg) def __dir__(self) -> list[str]: From 32150ad6b4b1f79b05be988bcf359e172ea017bf Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:47:09 +0100 Subject: [PATCH 004/137] refactor: `ext_fn` -> `Dataset.read_fn` --- tools/vendor_datasets.py | 45 
++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index a50297420..e79ad6010 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -3,9 +3,13 @@ import sys from functools import cached_property, partial from pathlib import Path -from typing import Any, ClassVar, Literal, cast +from typing import Any, Callable, ClassVar, Literal from urllib.request import urlopen +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -18,6 +22,12 @@ _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] + + +def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: + return suffix in {".csv", ".json", ".tsv"} + def _py_to_js(s: str, /): return s.replace("_", "-") @@ -27,17 +37,19 @@ def _js_to_py(s: str, /): return s.replace("-", "_") -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] - - class Dataset: + read_fn: ClassVar[dict[ExtSupported, Callable[..., pl.DataFrame]]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + } + def __init__(self, name: str, /, base_url: str) -> None: self.name: str = name file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix - self.extension: ExtSupported - if suffix in {".csv", ".json", ".tsv"}: - self.extension = cast(ExtSupported, suffix) + if is_ext_supported(suffix): + self.extension: ExtSupported = suffix else: raise NotImplementedError(suffix, file_name) @@ -45,7 +57,8 @@ def __init__(self, name: str, /, base_url: str) -> None: def __call__(self, **kwds: Any) -> pl.DataFrame: with urlopen(self.url) as f: - content = ext_fn(self.extension, **kwds)(f) + fn = self.read_fn[self.extension] + content = fn(f, **kwds) return content def __repr__(self) -> str: @@ -57,19 +70,7 @@ def __repr__(self) -> str: ) -def ext_fn(ext: ExtSupported, /): - """Very basic mapping to `polars` eager functions.""" - if ext == ".csv": - return pl.read_csv - elif ext == ".json": - return pl.read_json - elif ext == ".tsv": - return partial(pl.read_csv, separator="\t") - else: - raise - - -DATASET_NAMES_USED = [ +DATASET_NAMES_USED = ( "airports", "anscombe", "barley", @@ -110,7 +111,7 @@ def ext_fn(ext: ExtSupported, /): "windvectors", "world_110m", "zipcodes", -] +) DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, From f1d18a2d3baee9edbb9d17146c90b73a29d7905b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:47:57 +0100 Subject: [PATCH 005/137] docs: Add trailing docs to long literals --- tools/vendor_datasets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index e79ad6010..5b0f25fe8 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -112,6 +112,8 @@ def __repr__(self) -> str: "world_110m", "zipcodes", ) +"""Every name that is referenced in *at least* one example/test.""" + DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, @@ -194,6 +196,13 @@ def __repr__(self) -> str: "world-110m": {"filename": "world-110m.json", "format": "json"}, "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, } +"""Inlined `datasets.json`_. + +- Excluding images + +.. 
_datasets.json: + https://github.com/altair-viz/vega_datasets/blob/136e850447b49031f04baa137ce5c37a6678bbb1/vega_datasets/datasets.json +""" class DataLoader: From 4d3c5509f1e656adc08015f5456fe3f5671c7ecd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:51:24 +0100 Subject: [PATCH 006/137] docs: Add module-level doc --- tools/vendor_datasets.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 5b0f25fe8..08c3094e7 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -1,3 +1,10 @@ +""" +Adapted from `altair-viz/vega_datasets`_. + +.. _altair-viz/vega_datasets: + https://github.com/altair-viz/vega_datasets +""" + from __future__ import annotations import sys From 3a284a5ea97ebe0ef500c9911eaeddebe88ad741 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:05:34 +0100 Subject: [PATCH 007/137] feat: Adds `.arrow` support To support [flights-200k.arrow](https://github.com/vega/vega-datasets/blob/f637f85f6a16f4b551b9e2eb669599cc21d77e69/data/flights-200k.arrow) --- tools/vendor_datasets.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 08c3094e7..26e1207c4 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -8,6 +8,7 @@ from __future__ import annotations import sys +import tempfile from functools import cached_property, partial from pathlib import Path from typing import Any, Callable, ClassVar, Literal @@ -29,11 +30,11 @@ _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"] +ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: - return suffix in {".csv", ".json", ".tsv"} + return suffix in {".csv", ".json", ".tsv", ".arrow"} def _py_to_js(s: str, /): @@ -49,6 +50,7 @@ class Dataset: ".csv": pl.read_csv, ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), } def __init__(self, name: str, /, base_url: str) -> None: @@ -63,9 +65,10 @@ def __init__(self, name: str, /, base_url: str) -> None: self.url: str = f"{base_url}{file_name}" def __call__(self, **kwds: Any) -> pl.DataFrame: - with urlopen(self.url) as f: - fn = self.read_fn[self.extension] - content = fn(f, **kwds) + fn = self.read_fn[self.extension] + with tempfile.NamedTemporaryFile() as tmp, urlopen(self.url) as f: + tmp.write(f.read()) + content = fn(tmp, **kwds) return content def __repr__(self) -> str: From 22a50396822dc48d4ed63bae3c8837dc28dab6ad Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 19:46:40 +0100 Subject: [PATCH 008/137] feat: Add support for caching metadata --- .../_vega_datasets_data/metadata-schema.json | 12 ++ tools/_vega_datasets_data/metadata.parquet | Bin 0 -> 9100 bytes tools/vendor_datasets.py | 121 +++++++++++++++++- 3 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 tools/_vega_datasets_data/metadata-schema.json create mode 100644 tools/_vega_datasets_data/metadata.parquet diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/_vega_datasets_data/metadata-schema.json new file mode 100644 index 000000000..2b5b9d955 --- /dev/null +++ 
b/tools/_vega_datasets_data/metadata-schema.json @@ -0,0 +1,12 @@ +{ + "ext_supported": "bool", + "file_name": "str", + "name_collision": "bool", + "name_js": "str", + "name_py": "str", + "size": "int", + "suffix": "str", + "tag": "str", + "url_github": "str", + "url_npm": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/_vega_datasets_data/metadata.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1ab0fb17143528da9cd460e84a0fb18a9f1d5b73 GIT binary patch literal 9100 zcmds7c|4SB`+o*87`tc8ls(4IkZnYaonv3JhMBP>%V0=J*=eyCB9e}>CKM+vgwlc{ zOS@2(N>SDp^?RN{r*qDG&Zpno`^WqH&6wxjuj{(6`+ME@bzct~qMb4bfD)j(JeZ9D znc=UXIv|ZeU=RTGk#1007Uj(X07j^gjX#F! zSh$Zjh(c#0(C+A998&vtG7yJkG+4_<=K-woKt=h7P^jKMG(`$+CkCkj6%|8*{Z_ep zB``(s186Z%oM_e~0JZ;lG_}Ayo4ww_DYyX1BrSmm3OWG>;-&{_JTZmI2@DQ^Ghhb@ z_GtLFo?Gv4ul1}qf}|dl^N~4szAnaIp(>G-WGq?xak=pQhw41+o5cb1TC>4Pi+ldn zm+PlC$Uc@#rY?$x`Mjg`Yv5 z6{4W49DY>iPJ+ih)$Fu^9r4FW`W3V@>M7QYlu2#cI?_oq6t@>j^+}pQ7jc|7>vM^T zOlgzn!6?V~mYf+t(mTE4-ugR;v99YQu|HMp|4H88^*2?cf?@mfWEU z@>u^yafQJrpJg|4NR-T+Bnv7>N0i%bX;#9VKhS*Y>N8i%w@=vmsEr((?uR|QR5O9S zuGHkN$akkNd7d(wDi)Mn$5-8AU)8RuX_P&+6+R z=;Iko3i4UO`#2h%$&Twnz>xo2R6|C912zA_%=q%ZnHdr`0)Uax3i_?u87IunlR^y) z^r!h#gI5@PphIwj9DlU5VQz2%`A17bu2~v}3AVB|3&S&}@c_BvBndrQSUW!LY~#uP z9Q{Ts?qBvOc(U|q<+sf40`X;~`IT==LRp*uSHK>4G~Bv={e8{C>2}}QY|G)a>4_%J z;=r8c$Ee)dvoW>`RSqQPY5QZ(m@>L|q~}g84iG-*y`gaH^LPtyZDuEgQVsa*pFnN&&60_DkTv3H z7G*uFH7+@t{>p8iO@6>!v<-G6CGFBJukJQv`Z0lE^c>S(^Ln6j6)F%Ibu5f&}zHhqK&$Uy;CJiFAd)E|4w`^5ipU~!Gg zh9JkIRbpE>9&m`sJ*@}sOH>d%$VgLVL#d^fe=wu#W zzWa;&;hxFwt~c39&`sQ?gST% z#lOG}f^cj8lQl0thFevP7| z(hhjk(~8C3SBFvA_b*Xt_#G;9AXLgUomfytiXm0F!Qa54n!%uc!=g-{QC+=U{Q8m3_%9gOr2s-i_R2cNGxXdhFJ2op8J7kD?!_A;nk* zD94-hN>tvN8p>v~j-60BVEV=Ca5mzG!=Zqz5%UKEPYYk9U3e&u+uzqA-dyIb4vP46txCpA^= zEv{}WiRnf+yZ3ry7?c?hCB2TH~2sn)%_?heQv6j8zF zTi~1xTloE4m7{L@yMJX-Cltp8Tv21hax;}N15#oomAp+$>xzwm-`G<3BWR{T zTq5EYGjP-_iG!nv_lY4OopRjfc{azsIAMsvV~Gt98X06G9r$lCO8|(}0Nx@opj8sV zCkhzY#V{KPB0jAHZkcl;g@9DTM%EEc1SHE|zyV*yaK;_C9s9b?J_Xz%#2_K29&h1G6M|fQh$op0rv0cDdh33gUdT;)L>>fLkbI{M$((Y zX*;3)!V2`FFAVpZx5%UjHDH)mI`oO$f+8Lg5^DdWkjyJhnALPmOnBfbY#=B4n2wQVR14q0*Fg`uLIP4+U>hIEm&HPE&&V zD3JrHWSWw%uLeoOM_q$JQC0U<@mBUBYpPKw1Qj(EWio+)_f;lokklv|B+O6nz~eRW z1T_L#1EPi}sA{N@NhCEqM4h0iqKYT^Xe#4X)O|EmR#oIBB&eneL^Z`?gkoiKI5*+< zO)XX{@3-D5;%lmJcOiQ!C>x`OsYm1>c0u3qOJdjC&H4ni@IHS1T|%3mTCvMLmoPAl zk!Gu5pW1tpR2W5Ryfds9aed%;Ubc`x+oTYSQQ0^un5RCb<7}_tP2tzpPv>%?i8`B= z$TjO3X9+hNxeHcW|D)%X@FI3Ly+&8ljicYW{Y_n%! 
zt>@ihW!oLIdwOrY2Eh*PgfZKC>fA<=!vPQOeX!FGOiqi$Xb5{O4WIGPVdi5+5SsJa z64O$DapI9X|9U4cj}tE+`y-(j%1Z>p-rcvw#PThS+=1#saM$R zH=l}V%jZdU?oPq9mN2?XU|Y^`iej-xhS>|h zD9c$>dzC7Ig&g-Sd4@rs<;>nDM?r|GwE3OMWJS}|VSO$~d z`Y%f^#$>f$=Q165WCM)M`I1Ak4Ju6TRFMxhxXCpQw!h48Rl7OnA&m2wyW;VA_akro z`@lA-ea+!!@;q!$AGAnSjT!IeOyb!<8tT}yD4kz&kDA)&?XKYC%iGe{cA}*4X-8eo z&`p*AA9mih?u3d&DPNtC#l%7+{}F$uelj5?xXb9x^5ssO0V|`7Glw2J%2n4XepH+G z8mbKH5M|3~{-o)+uIXrK7x{!`!}`ZfLzl-yhUV_x>$_r23>SRN?s7P{eN#pWV|(vM z9M#!^!&pX`^Ou7npXPcVbLz%_>9jDY7Aigq}T?m@)5(+is#5Jy0>3X z&kIHOWnMp%vq&(PxX|Gz_3V`!!y<@2mSoRH{<2I(qcIN=zux`of!@&WCHvuN-k9oX zFS(0Orz!ZRX0cGWr#pC@uDs86_vszkJZku?|E;B`&TTPy8Mc%Jx%?O735yu5L%FPq z<~wX2B;OvtJ9*wilJ#O8!r;;i66L;I%xT?7k(c<;Wl`OMlZ-eap4-^UM|HXCzY_%`Vh-%zIBxC@%}2iHf=sK09lRpAYpXa_9oHn+n+Lu>_ij7<6wDlNVdy8RxvOl>$MM_po!uLr(?_#G{=)rpP8;eB& zc*j=9>d8KKi!|~toW|Ii0DSM)n!=&&9?l!PUd6DnFC;x!$KGn-J5}kZ`ob-s`1$L% zyl>?0Jl~%BIWCNZo7e9gmT0)|iF)>Br20mSY^9kIj-^}m`QuzY>>*Axd0$AMwzl8G zMXkI^_Nwl*GUp3)=)7>OPh!rfGaPwAgH8vnbFU&y<7 z;Q$V?|IxcJSo1C@-ocQQ=Is;czcWlR*e6^mI3(zgj)mTe&!Yr88cJ({26{+O_unM) z=SZLmzk&e3JP{;`4}Em?s;EedhNa}KvHWT|4NEicF@uUNzb>5T%m4Chqqul{f{icW z3wQz?ogW{xROC7q%ozS^=e)en<3sdd=b?p9I~u&R)wGU;ub1<4UP$k#xV`hh$m68_ z?rLAUYe2cX7dvG36D7p`ISb56hl|rQ|NK;q`s}@NLux%%l&9Pi%HwMVwSnImowGIk82_*i znth7E{#w@r3S5xO=W@1vmc8USb;r8sO#bsM$LeOkJ{jiF@qxY(H8YgkY}5`z+q6zp zSF6?i+^$fq#S6t8i;AB!tN31Or=EJ!;u1Q!`-z2%M<-8pQD#%S5L>>}DMVIP2dyrQ zO(hc@5WFv^*R??#mss&#;Lu)M^`NWx*N)b2p+giLI+S}8x$Umf!5E@=7Qd!7b}*x) zpH&Bkll*jJF4gGNQ?5rdcii`?YpCL5%0~?KGs(&3{5by-LWl%6D=!!+ZsS})L1GdC zv8AsX4@DBTZ6EFHxrAp5TXxv1A!RQ%lhP=Un9hwjI+)TctMlnI;giU+o+8DYaipeta9}m z`o_brZCW#Wb+me0~@5|pg8sAs1Z33%x(2F&!AhsH}QxH<UgQpn* z+;NX@vtKilm8t(1o|WiQSnzs3@FW9(F3n?r3>k++7dY_4TOkHTw6n8z(xOqe`vj3N zH(8Y6kEW^tkw`RxenickPl@p5^gsm68%>DQK3|y z0vAo-YAd2Xk!WBGbtgK~Y1l$#Lpm{k`iM39M4|>vZ#SJDotO#SPBe0a9=1e%Gnf#R z8o~Ub54|6eXcGks4bg*|;40ItyUE6c2#N{*r|B8MQmoJmCOO+ty`7->Skw8@UTe62cyq14`{y^iG6`gj*7%1_HjXKiYw>>A`6;sa5o z>o1Bf5B*^RvnCqB>iP%$IY5#T=slrc(8mv6El5s3Sl`h6twMZ);3fDEt&fS?cCyn} zUy5nK&(9xT?^SxFV9URw2dm*9=Kni>3}LOU(sNNE(8wl%Q6wiq1hjrMc>OG(=2h^y zz*(Xlq%e3<{UCk8sqG3R4C#$Tq7vOcAl5{pIb7u$6i9R3N{DcAQVxX6e}nfQmls|y wldY=W#)K$uXZ=X>CUd;2lPwXzFxyZANntc>HRMzL!yj-OdfRUU{U6T%06c=}+W-In literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 26e1207c4..871ac14af 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -7,11 +7,12 @@ from __future__ import annotations +import json import sys import tempfile from functools import cached_property, partial from pathlib import Path -from typing import Any, Callable, ClassVar, Literal +from typing import Any, Callable, ClassVar, Literal, TypedDict from urllib.request import urlopen if sys.version_info >= (3, 13): @@ -25,12 +26,130 @@ import polars as pl + +class GitHubTree(TypedDict): + path: str + mode: str + type: str + sha: str + size: int + url: str + + +class GitHubTreeResponse(TypedDict): + sha: str + url: str + tree: list[GitHubTree] + truncated: bool + + +class GitHubBlobResponse(TypedDict): + content: str + sha: str + node_id: str + size: int | None + encoding: str + url: str + + +class ParsedTree(TypedDict): + file_name: str + name_js: str + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + + +class ParsedTreeResponse(TypedDict): + tag: str + url: str + tree: list[ParsedTree] + + +_GITHUB_TREE_BASE_URL = "https://api.github.com/repos/vega/vega-datasets/git/trees/" +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + + +def request_trees(tag: str, /) -> 
GitHubTreeResponse: + with urlopen(f"{_GITHUB_TREE_BASE_URL}{tag}") as response: + content: GitHubTreeResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with urlopen(data_url) as response: + data_dir: GitHubTreeResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + +def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=_js_to_py(path.stem), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + ) + + +def parse_github_tree_response( + tree: GitHubTreeResponse, /, tag: str +) -> ParsedTreeResponse: + return ParsedTreeResponse( + tag=tag, url=tree["url"], tree=[parse_github_tree(t) for t in tree["tree"]] + ) + + +def request_trees_to_df(tag: str, /) -> pl.DataFrame: + response = request_trees(tag) + parsed = parse_github_tree_response(response, tag=tag) + df = ( + pl.DataFrame(parsed["tree"]) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated(), tag=pl.lit(tag)) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + +def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: + metadata = request_trees_to_df(tag) + if not fp.exists(): + fp.touch() + metadata.write_parquet(fp, compression="zstd", compression_level=17) + if write_schema: + schema = {name: tp.__name__ for name, tp in metadata.schema.to_python().items()} + fp_schema = fp.with_name(f"{fp.stem}-schema.json") + if not fp_schema.exists(): + fp_schema.touch() + with fp_schema.open("w") as f: + json.dump(schema, f, indent=2) + + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] +""" +- `'flights-200k.(arrow|json)'` key collison using stem +""" def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: From a618ffc6450922f602391b5511edda37b2fe325c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:49:43 +0100 Subject: [PATCH 009/137] feat: Support env var `VEGA_GITHUB_TOKEN` Not required for these requests, but may be helpful to avoid limits --- tools/vendor_datasets.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 871ac14af..259999fa0 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -8,12 +8,13 @@ from __future__ import annotations import json +import os import sys import tempfile from functools import cached_property, partial from pathlib import Path from typing import Any, Callable, ClassVar, Literal, TypedDict -from urllib.request import urlopen +from urllib.request import Request, urlopen if sys.version_info >= (3, 13): from typing import TypeIs @@ -73,8 +74,15 @@ class ParsedTreeResponse(TypedDict): _SUB_DIR = "data" +def request_github(url: str, /) -> Request: + headers = {} + if tok := os.environ.get("VEGA_GITHUB_TOKEN"): + headers["Authorization"] = tok + return Request(url, headers=headers) + + def request_trees(tag: str, /) -> GitHubTreeResponse: - with urlopen(f"{_GITHUB_TREE_BASE_URL}{tag}") as response: + with urlopen(request_github(f"{_GITHUB_TREE_BASE_URL}{tag}")) as response: content: GitHubTreeResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): From 17923404866003e27a510be793ab65c290d8802a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 7 Oct 2024 21:51:45 +0100 Subject: [PATCH 010/137] feat: Add support for multi-version metadata As an example, for comparing against the most recent I've added the 5 most recent --- .../metadata_v2.5.4-v2.9.0.parquet | Bin 0 -> 11354 bytes tools/vendor_datasets.py | 11 +++++++++++ 2 files changed, 11 insertions(+) create mode 100644 tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet diff --git a/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet b/tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5626093db560b805b33261bdc5f6b7754ab3451d GIT binary patch literal 11354 zcmeHtc|25Y*#8;p*w+~|_H`0s>`S(>FJl@)!h^xs85ts>$et}(s;3enl|2N!-mSkyBTJWA3R8PppmljUo(yIm2w_ zV1x892FjnH8XygUARqvENMBZ15tV0y0FP3$Bcz#Ltq<^af_m3 z;{_(ioFzo7>L7@|Kf4OCajFkb|7k);BV};VUc_FQ-#vBZuwh4~iH8la3%~=&D863x z(p#7KeNDAS`@#rnfw`Y)@|J22?A&%SlyJy|zWVcu_S;Y8`N+AC54DCQ{X<1(RP5z< z?`6rI<8HaSowMP>viddqH0@a#oU$l?dZL72&GkGchwWnIq-}s&mBkmkT5Q=k&7-$G zhDk#c+S4LdLPHJ}sA(iBN9&Ovd07&j*rRL-6+U5j>xDsIu)3o>Ez;8PZIWAw%M1$9{Urs*Rvqx^Wy#=Mlob)*F^35m#o*otx$2xCrREkC(j`zheUYhor zfVB5jN+kwIbce6STU!l!;IbiYA}zN=q{ZvZdZUh26}3Lc0Jlsc^B<6M+lE9?ViGCv-1S#4iASE5^#X^t4TpML4_l4mvSrDgBzrsMP!7J%{fm zwMW0|tw{c?9_A9`(ZjIsZOPZy>C;~j9~hfCKs4mh!#>ke@xZx>2i8d6U=NZfku2{) z-iv^$gGu?|An(noYYCiU1kSQ=V1)6Hg9qL8r4j7O@$St&8p2u0o(AU~yS0|RD^xW_olJIm>Nl{S* zCvzgLc%cKDKZ8+H{5W1Fj|Ejc6s&ccLt2vmh(#%nOo?NSVd!vED+2Q8p1g@{ z5cL=)`H4s+VDE;lqb7-kOBgH$-bnZbsbJa;E52_XRg2cos*ofF(?+6@!jvR)2ea- z_U^7?=sm+3Pg09yYXeWc>arSuCbo)txII}Db#y3$E%eNm(AxrzfDLe%6?R+qk1;pa 
zG=Kh{kAzF+o$$r>kMC@gWVaL7)Z$)#opr_DWwLT;{?b2r^_EBRiHV~tm0A+82fCB7 z_nyD_vl`j&e~n?ayZl`q5$QH0RwKuv-)$iEQNc#Een2JYw1fR!#?_PQxKrb4UEW=* z@u|~l8W52g#d5Av%{4b!2sC)luu;!!;UWHqlkSh??%qBVc~6-`RGW*1=B<%%il@z@ znQ+$40Z++WiUUQ=&$=5HO`Zn86$Ehthn7QPJZcYUirvU5=o-7(nb|$tZkm1(IkAH; z>XPbVx$`XWi%sctbgd5vy*l7OgOs_&Ev#e7++rs(bqst-? z?i9&545y%7oxt`3^|AOhj_BQa3>`OeAlB0QCc%&E*+frvoAtB$r*i9?A9<@{OF!zf z;uR&gqx;xQpw+Qgoaqdc-R#?g6d@PQyU5x`gXyrF(A|GJsnnFi{3G>Z5}>0MvNpGk zN2ctzG{23P(2NOs&+C)G&@bW9AikvCWDG5<%om~+u6ne+`6l`Lse&VEfWSy_VD-4K z|InTLx$Ha(H)?L}he7+(Zi_DM-rtRjFOQXWF5BH6d&uY}4bQrB`0$f+fNnYm1RcdO ze0K`zj6nRaoWgITngUCczdMEBMl~;cim}Asox*oiGxBn$n@J5Y<=Rv@Y?R`lln5Q(?{)O2BjGU@Dx3ZVgBY47^#@_Bd>$82o7&& zLyhoI!1y?SNVD6kCwCs@+2!;;Ibfr|pm3JNT7f)rThfv{v)hvnf7SSlfY zgeAjIu&kxPl0{IPa~48{rNN?^AO)7K7R9DMw0*R7WdN#WPN`3Q+ns>%lP*; zF|Xp*P}l^I?cB$A$OZ|;z!## z^RFC9M&hwhSm6ckhGqH!6W}UyS~7%e2E@igIvIdt z<_TF|opl@6JDw6nK1vMkd^rx068}IMl281}OY|Mut5PCog`f?c((rjfGBt1kGI7j)X-fh*&56dB{-Y~aX%<> z*|FwcA(VSw_mbJ&B)Ot#esS8Sboc1cJE!{(KjJZI5xA)kI*c1{F&Q9Ps+5dG-d=cJ z2V*?^c56d)N1D@2^}DF4>Cc?eO}09O`%KKhxO*Bs_-gE(U+6c-Oti9QFz6Jo;pw+b55Fi@@$%fEmL1?oebl+5^4JCN5>$wbpzlWeI zbt!uOO(3Z0`)F9a;9tB%q;Q)$z<)=d9~rDJCDd1D>>{}+C%d>7AwD2!?#p2NkVU!* zk_Y|?vsmkqc_spMuQZ1oyvBV8;$2RmmN= zs{^;YG&(u^L{j5sY4#WRAkK{(t5DX!|FP%tREcLt0y?q-(|2C;kW>8Yf;avuT(upa zcdAEAx#I8R(jIH1wfBByX^_ZU38uQvWYADyZ!EV(WAHR_s?(s&rqUSd(S=O3(y}iGI6Q=DEgGMy6&6XmfKjgoX|p&^5cTBosA})mD-FQPV=QzTJDv=qo3hk1+V3 z2|JnHS64`IIARD=e1FXaF3{p8^XXasOO;YL*~VR^9_WVomDFl|j^u&NKS-EeB3W9r zH*as--Sh6r zrZZ$yS z$~-0I6juVF7vB>!@z{d&;?U(e^kS8p0-17ZrrpFQ8P&U6$X-1BRww|c_uv#o*FZE7x=G=1GX|;1cJ(qE=u2be_#RXZ2P;veje)T%m zHm`1n$W7~gE`CPJA|AOwO5yCqX5%y}@-uCf92|zU8qg02I|9rPsq(U!oAlHNW?T#3 zIR(=q4tUl2@sA`u=9lCt5H&&-J@&cJ^?+Sfm0MeL%IbW+qCvi%$LmBLI%|9(@{)5( zD%@iDjVY%GtG)g4%q?cCIxx@LCA_Nb`S~|rkvlCoEN>E(ur&wpC7dOxk)msISI$pZ z*sVC&>dLn6xxT#bO8!{T3w-l|;YX|m9XqixxGNBI-`q37 zePB0%4&6yElvD~k4*Rmgbj|^8)9>%SFCs&qlv9vgIUlrTq^vsWI*NwM%&cwd) z?az%yAIw|1>U4?8ZDme6BzyeDyF@tU@a7ig|r? 
z`;zDMl;VnTMP%fSNH5s&=XF9O5eJ;GXYqqI>ZS(L*N@kunuzt{wdV!^) z@RVI;Ke4I`nm?uNde=}#O(!?=UT%%KS4-H;C;UmiukTk4OHFo?c9vb-BR07s_ja#g z>pX(#X>#I&q%AUHl3q_AaDTk(wkNgLqT}N!f3#h*UHS9?3pU-|lHCMZ>5uMPt2{a4 zmHN+K;)(CerBC!q4%J3wm z*?`l>8|oA0F7H7KR0tsklgJ8tiQWY2X`3e@ zlt3nik`=%@3hn_Ucd~+)mpVb6sHU#zp`zxc*HHEF&{R@YQgqkUM0+U`)CsB{ z>IB5Ucm^7+j@DGwbXNzZp*2<1Row{$RWvAFQ$tAwO(1G0qLtK$>Pj07;{R!4yBYIj z4hsozT_4JIqVy^e0)HWaio;Nt(!Ust|5*bL&gYyGTG(qCm0ALu5pl!K3GLw-1d?%2 zVu0`75cwcts6tS1;9uvm{yGa)pv-NB%~%;VkgoTCCIqCg>q35xzgzeaWr5P!D7k}H zTFPizW%MY|TAum&&YUY|miv#^5g!)$qP86bwDu`y^+MygeHF6-;7BMC1_S`ulBLqw z`)>$w72_k`tL5p}wp#4c@MpQcz-8}JxJAZmWqiWFvy42cQYI00i2Z4A@xCVy3PR+R zEV0$9i#G}sF6w_haaG)3sgt3w5X-U%tM$%w&%*J{ym^r;P%t<{tHU2N?3oRW9*{g@ z#tYDyNOGI&;{z5L=9BgpJ%W*YeKQm8k(|~_aWGon)Gafz9Vho@@f3SU^c%>8P1yBq zFFg_x9Z}LD$~txI-7|I`2A+U)qeZtuql0zZ{n5vHnXv5A%UpHyw|gB#2G3>Mc+b;3 ze-!`zWzj*x8;MA7X2=`dEbLJd>pYLlg!bOE&c)u$MGsb#69n!JH+GcKHp(cp4mUSD zN9B!V_IxI*zjS!wn*ZkdO2qMNO(D7074AQ!C&fFslC8aC3bj`7IUd>ShR@EIlxa@v z4Q3}N$e!O{vCy`+Y{D38DoskzvYr_F;9YS`nm)bR(ZAyI;J1^K^4|`|6SN8Eo;V$W zENPGyNLo7A^St?EZBOhb>u7%X;;{4c>q6uB2i18@F|?40mW!eRB&^QG9N&+UFP8&X z#&x;e9rX;o__YynQDsFHvB_!4W|(ZrZV0!EfjB2mx&XIZFo90&3@1&pTf9C7${oiE zo@f#P!m`kZ+;;gpI&nKg4rL#<%@EF3X&#++5jvR|;ofNaRZlWr+3*pjms>R_u(;#I z?UdjP1bJ7%a2X+jmbq5cr!#L}DG5h>l;3^ij+)dXiDa`C3wJSDAC;^b7u5T=uAbN3 z1l}39SU)^I_F61%FDh2B+OLMoszcU-10KY48ex3QO$T>)ckZao>+Y(ff832>TEg%3 zdo8jt-+3>fb!uA?%3AzoqYa>0|E=?id9uZrkhA?1oMG$dV*u9_C77qmQ=mxrhMqv7CB~ zHvC9I4vw$Va-?Y|-N?$C%Q=;2R{i^sK7y+GqnCLc^lpkT6}ca-Qx|o8;JJq4>lP-U5@_>Z!i=|`j`#txKJ~=K$ zDl*ge?29*bH@mWc0)`m9dE5kJ|D$NV zp~47(msK5^7WMuyb6;f$tH(v|;4*oXjwEC6-bzX|XjPB2=|01xR@QhuYUQ!v4DY+n zv*f$p`~{JPjQ7_pC#Shc7L^?iS*!Uk%7;45lG@2>`CeiQPmOh1Tvgs--d+A=rSnm& zXZ|?5l>L_0$06E)3JNl~o$Lr3GOU#Tt&pZ>v>ymEJ(D>jMF%$USd3Ng+UR`|`J?CieEiE!={-L}J&z7Wh45b@;Nca@or%za)#5nT|Hk#!&YpwCo zwDeDB=2A|z$k5MJA3G>fLd?qTX)R#qezR@(Mt9szy;mQDm}I|>?(WOkR`^tsOEKgC ziRbSAiVt~qcHNKqM#7yck%^`PZHBb)*jgQe_UB@^p-)H+t87X2w@kG*c==lTeuM4w zi9JGeX@>4+(;o8C?BKI$h85!wvk*NAn%SdsPUn$GYftZiygB3W`)*uG{HW@s;}3@< zgvuI^BJtLA7hf(uXJ(V=&Na~!pJ)q9xh1}rIS$9q#x^85jop$_dAR-4LW=bv-m64g z@uB z%un`Om-wH2F908;wl9yTs&iKCyE>c3U2po!-A(vBRY(sgL=8aoDEutsw{~OTiRqM{ z#3xXN)M*O-m*VU2(^M&qpcGvIbx-HJD!&_F+ z?Z~9(cL6`ltaYpY-}ycTpQIXK3^YI~Er41)2twq4&1wo?r7EA83V{{T0Ptg(^dQnv zQ267$;0_v65W$MEvDvXpi|pY;3?v}#Fe*?E4HZKS24f8VFdBQobc8kK7EDvKEHM}| z2CM*HtueBAusl4RQf`jHcyH7rfmZ_$O45{)jl&pVFoxD(b&MTVhBcTrq6+h+w%C-9 z!KhQ@?W4-03NxjYV~p*NpWi=^s9J(*G-jWMQ}>lgO31vS&8)&y&T zhc{)kKs~)F_6GZp3nm6qhTy+xd`wk++;`b~d6@bC`~FkLdqW-}$m*ZuQOxj*{(ql8 zMii@U$lI-?Np?34h$QUN3MgX#_1i*mJIpAtsYLJUTM ziVyJ77>ora%P}y3>}anUwtJUi044no^ZobrrHq%Uy^5!aW~8TsLAblJ1=?|!H3mR1 Z;7I|55HhkHBptu_2RfhtfCs-S{tx(skB9&O literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 259999fa0..61c701e1e 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -135,6 +135,17 @@ def request_trees_to_df(tag: str, /) -> pl.DataFrame: return df.select(*sorted(df.columns)) +def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: + import random + import time + + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay + random.triangular()) + dfs.append(request_trees_to_df(tag)) + return pl.concat(dfs) + + def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: metadata = request_trees_to_df(tag) if not fp.exists(): From fa2c9e73c1e09e9721a2e095e4715e1dfac9939c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 21:58:33 +0100 
Subject: [PATCH 011/137] refactor: Renaming, docs, reorganize --- tools/vendor_datasets.py | 146 ++++++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 33 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 61c701e1e..296a5f590 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -9,26 +9,51 @@ import json import os -import sys import tempfile +import warnings from functools import cached_property, partial from pathlib import Path -from typing import Any, Callable, ClassVar, Literal, TypedDict +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict, TypeVar from urllib.request import Request, urlopen -if sys.version_info >= (3, 13): - from typing import TypeIs -else: - from typing_extensions import TypeIs -if sys.version_info >= (3, 10): - from typing import TypeAlias -else: - from typing_extensions import TypeAlias - import polars as pl +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from tools.schemapi.utils import OneOrSeq + + _T = TypeVar("_T") + _Guard: TypeAlias = Callable[[Any], TypeIs[_T]] + +_GITHUB_URL = "https://api.github.com/" +_GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" +_GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + +def _is_str(obj: Any) -> TypeIs[str]: + return isinstance(obj, str) + + + class GitHubTree(TypedDict): + """ + A single file's metadata within the response of `Get a tree`_. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + path: str mode: str type: str @@ -37,7 +62,16 @@ class GitHubTree(TypedDict): url: str -class GitHubTreeResponse(TypedDict): +class GitHubTreesResponse(TypedDict): + """ + Response from `Get a tree`_. + + Describes directory metadata, with files stored in ``"tree"``. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + sha: str url: str tree: list[GitHubTree] @@ -45,6 +79,15 @@ class GitHubTreeResponse(TypedDict): class GitHubBlobResponse(TypedDict): + """ + Response from `Get a blob`_. + + Obtained by following ``GitHubTree["url"]``. + + .. _Get a blob: + https://docs.github.com/en/rest/git/blobs?apiVersion=2022-11-28#get-a-blob + """ + content: str sha: str node_id: str @@ -63,37 +106,55 @@ class ParsedTree(TypedDict): ext_supported: bool -class ParsedTreeResponse(TypedDict): +class ParsedTreesResponse(TypedDict): tag: str url: str tree: list[ParsedTree] -_GITHUB_TREE_BASE_URL = "https://api.github.com/repos/vega/vega-datasets/git/trees/" -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" +def _request_github(url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. -def request_github(url: str, /) -> Request: + .. _personal access token: + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + .. 
_Media types: + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ headers = {} if tok := os.environ.get("VEGA_GITHUB_TOKEN"): headers["Authorization"] = tok + if raw: + headers["Accept"] = "application/vnd.github.raw+json" return Request(url, headers=headers) -def request_trees(tag: str, /) -> GitHubTreeResponse: - with urlopen(request_github(f"{_GITHUB_TREE_BASE_URL}{tag}")) as response: - content: GitHubTreeResponse = json.load(response) +def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform 2x requests to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(_GITHUB_TREES_URL) else f"{_GITHUB_TREES_URL}{tag}" + else: + url = tag["trees_url"] + with urlopen(_request_github(url)) as response: + content: GitHubTreesResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): with urlopen(data_url) as response: - data_dir: GitHubTreeResponse = json.load(response) + data_dir: GitHubTreesResponse = json.load(response) return data_dir else: raise FileNotFoundError -def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: +def _parse_tree(tree: GitHubTree, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" path = Path(tree["path"]) return ParsedTree( file_name=path.name, @@ -106,17 +167,18 @@ def parse_github_tree(tree: GitHubTree, /) -> ParsedTree: ) -def parse_github_tree_response( - tree: GitHubTreeResponse, /, tag: str -) -> ParsedTreeResponse: - return ParsedTreeResponse( - tag=tag, url=tree["url"], tree=[parse_github_tree(t) for t in tree["tree"]] +def _parse_trees_response( + tree: GitHubTreesResponse, /, tag: str +) -> ParsedTreesResponse: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return ParsedTreesResponse( + tag=tag, url=tree["url"], tree=[_parse_tree(t) for t in tree["tree"]] ) def request_trees_to_df(tag: str, /) -> pl.DataFrame: - response = request_trees(tag) - parsed = parse_github_tree_response(response, tag=tag) + response = _request_trees(tag) + parsed = _parse_trees_response(response, tag=tag) df = ( pl.DataFrame(parsed["tree"]) .lazy() @@ -146,13 +208,21 @@ def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: return pl.concat(dfs) -def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: - metadata = request_trees_to_df(tag) +def _write_parquet( + frame: pl.DataFrame | pl.LazyFrame, fp: Path, /, *, write_schema: bool +) -> None: + """ + Write ``frame`` to ``fp``, with some extra safety. + + When ``write_schema``, an addtional ``...-schema.json`` file is produced + that describes the metadata columns. 
+ """ if not fp.exists(): fp.touch() - metadata.write_parquet(fp, compression="zstd", compression_level=17) + df = frame.lazy().collect() + df.write_parquet(fp, compression="zstd", compression_level=17) if write_schema: - schema = {name: tp.__name__ for name, tp in metadata.schema.to_python().items()} + schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} fp_schema = fp.with_name(f"{fp.stem}-schema.json") if not fp_schema.exists(): fp_schema.touch() @@ -160,6 +230,16 @@ def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> Non json.dump(schema, f, indent=2) +def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: + """ + Retrieve directory info for a given version ``tag``, writing to ``fp``. + + When ``write_schema``, an addtional ``...-schema.json`` file is produced + that describes the metadata columns. + """ + metadata = request_trees_to_df(tag) + _write_parquet(metadata, fp, write_schema=write_schema) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 24cd7d7d9752d7424f9b8e37436d032f31bc54c1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:02:13 +0100 Subject: [PATCH 012/137] feat: Support collecting release tags See https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags --- tools/vendor_datasets.py | 74 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 296a5f590..0604df780 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -36,14 +36,32 @@ _GITHUB_URL = "https://api.github.com/" _GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" +_GITHUB_TAGS_URL = f"{_GITHUB_VEGA_DATASETS_URL}tags" _GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" +_TAGS_MAX_PAGE: Literal[100] = 100 +_SEM_VER_FIELDS: tuple[ + Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] +] = "major", "minor", "patch", "pre_release" + def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) +class GitHubTag(TypedDict): + name: str + node_id: str + commit: dict[Literal["sha", "url"], str] + zipball_url: str + tarball_url: str + + +class ParsedTag(TypedDict): + tag: str + sha: str + trees_url: str class GitHubTree(TypedDict): @@ -153,6 +171,55 @@ def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: raise FileNotFoundError +def _request_tags(n: int = 30, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > _TAGS_MAX_PAGE: + raise ValueError(n) + with urlopen(_request_github(f"{_GITHUB_TAGS_URL}?per_page={n}")) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + +def _parse_tag(tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{_GITHUB_TREES_URL}{sha}") + + +def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: + """ + Extracts components of a `SemVer`_ string 
into sortable columns. + + .. _SemVer: + https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions + """ + fields = pl.col(_SEM_VER_FIELDS) + pattern = r"""(?x) + v(?[[:digit:]]*)\. + (?[[:digit:]]*)\. + (?[[:digit:]]*) + (\-next\.)? + (?[[:digit:]]*)? + """ + sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + return ( + df.lazy() + .with_columns(sem_ver) + .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) + .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + .collect() + ) + + +def request_tags_to_df(n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + response = _request_tags(n=n_head or _TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame([_parse_tag(tag) for tag in response]).pipe(_with_sem_ver) + + def _parse_tree(tree: GitHubTree, /) -> ParsedTree: """For a single tree (file) convert to an IR with only relevant properties.""" path = Path(tree["path"]) @@ -240,6 +307,13 @@ def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> Non metadata = request_trees_to_df(tag) _write_parquet(metadata, fp, write_schema=write_schema) + +def collect_tags( + n_head: int | None, fp: Path, *, warn_lower: bool = False, write_schema: bool = True +): + tags = request_tags_to_df(n_head, warn_lower=warn_lower) + _write_parquet(tags, fp, write_schema=write_schema) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 7dd461ff536205b5e07c62b2a4e09ab1e4bf5612 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:05:32 +0100 Subject: [PATCH 013/137] feat: Adds `refresh_tags` - Basic mechanism for discovering new versions - Tries to minimise number of and total size of requests --- tools/_vega_datasets_data/tags-schema.json | 10 ++++++++ tools/_vega_datasets_data/tags.parquet | Bin 0 -> 6210 bytes tools/vendor_datasets.py | 26 +++++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 tools/_vega_datasets_data/tags-schema.json create mode 100644 tools/_vega_datasets_data/tags.parquet diff --git a/tools/_vega_datasets_data/tags-schema.json b/tools/_vega_datasets_data/tags-schema.json new file mode 100644 index 000000000..80f248a66 --- /dev/null +++ b/tools/_vega_datasets_data/tags-schema.json @@ -0,0 +1,10 @@ +{ + "tag": "str", + "sha": "str", + "trees_url": "str", + "major": "int", + "minor": "int", + "patch": "int", + "pre_release": "int", + "is_pre_release": "bool" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/_vega_datasets_data/tags.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dc0ff652ed261eebeed70ead42c0f7352ea4e8c3 GIT binary patch literal 6210 zcmds63pi9;``;oFdYNbQk}%r{U;3o$qm;^Z)+O|M~V~_IlTT*Lv6Qec!cyYp>0U zWy63WI1vxWVKrbUAd0{u6kSM7WS5pEQA9*I>nywL4i0T0cNwb{sg%0Bh)s3;^pobhe%i6lsF0)=)saaFDoSWb_P6?EratJ1OkR3!b@jlc0tVsqD?z7=TnwlH@>#-M3B(EM#v>hAD&Fv$fPpjj<=_m$SAMCv>+9$- z=-O^N%en5_`b=FtEw|-5I!qlsrYnQ1t;5rEXKr_O)6yZS(Y3g)dJL{RkH>IzqwBgd z+_vlUv~}EEwe{V!-Q8T7JUyni)^@Gw(7X_tFAs$WPDm1S;XEPb3w8>{`%YPv6tAA@ z8G)WcLr_YQA#VV2;^-}39HTfeG*kE+)sdB|FLmwLXuCe%07RdQYdKJ|0e{sRHs zK5D5=1daF@vC|gzUccl#tEgbU+G{sZ+FPS`-12duTvmgOSuJa^hsTO_bY{HuZRJBH zHf4q_rnXL=OCox68Q!0j><5mWy}7@mC09o*q~L?D%e_>+(2t)_ayk1~7gpPQSRHx4 ze?+YDcyU3|GtG^bstGRCiB3f&5mo1NJL2kH6%OOY$-c2?oASF}+y0o{<1D+QXJ2F0 zi1|C9D$-l2jz_JfoZ|yko{6!zY 
zMSIy{k2G*~gP*o)S9b2W+2Hgv_sI`zYg5EI(9_{lereafRV(7G!cz%OcQtD&117QJ z;c-cajxQW?%bNFk$B<1`?vilBhff;JmYnMvLQJ!L!WNO&OWkYPJswb6LkPY3c17r> zY`2;m=l1!x&-7WPpBXtwGbDbut6|q#lZ;-@xUA>c666Zs<^0_C2xhy9JiN0JHsQPslfx!zH3za24=wWzc@uh1ct zeg*e7Jwf@>jg;>Vn@=o%)KQhZVChw*kt<YM)Ti1UHpg$~09XyrhBDYd< zfkNKL-9g;W=5FF>R#b{3=fhQd_9v;Ahb=CzTUPNoyj|mzxpUL<{vsW7(dNyHOX4Cd z``BV{h_AJ9UZK0v>*|xrC6gX!9qi035A7f=XpFsWXlweYFL?W%EZetnk?k9twk%#K z>9nRgyT&(Sb?eoX%}Xwmq_1x@?R|el{Pv_?MW5cpKqax*`rR3$x7jP3aRI8GY6%6Z z0iD^k!oJ4Ec^?LeKU05uQM7ipMj!YZT`L7 zlA0@AA@^#HR!Oe@0l#-%5bUNW`i!)LVi86`}8+?sia{)u{WVzUP$!nKoko_ShT z;n{^Ec6KecHbRn{a`D00Lss$3%6B(jHFFa&j6&i$f-+7CufW1{-wTy3x`UDbh%q@q zh=!gNU7+|*9G#CPU`J`!9)gu`m3#4_Q2TBg1euvY@rDX#XE-XzlgsC*3I$|@NUat^ zN$NksK7i*cE#OMxS3KwH864~%WI&^F{k=3jyn;PLTs3)qKD0o0f4?BwE_V+u|F+DH z8_W%I4-TS%HniYCclRLLoEh?OjPX~w)seVY5jq;uD<$J^VQJ7JFXffz*JD&Tyut0L zvbsZ2vB8dUA#uzr=mmtdypw*|d_GcH_oSyOVfCe47jDk;!WV6|Tip&cHn^sYC^$+8 z|I<9J^iIYK`w= zh(B3gP=-l(` zgG&?iwL`xu2#EJx97?>8uLy`YrVr=c5hw|WH=f39h-ST5rK~%c%gP7|vH_aXeP+xM( zNAi_#tsj<&o|3K9yS>ADQ`_F9`V&xLF9$po&@R;#bR>av2?+u&AF9H^B zxVtMMI8{Wo>Em!NU}4oOpZd>?=Cps;yzxy!K4IK8^|^)LXtfMy&C?ev?^~&;^-Oh# zQP-~?lh}!rM;4B&l$tWB*4-4*sOXofS$@Nd^RJ18$L*yZpTJ+fn%b#yr~gBk=z-?T z8^@faA9r0BI&Fq8ii{TpEZjvL&6HETkTrJOzU*^CXlves{I}havO4=c582a5qa8<& z=Z>Z){#vINP?q+fLUDahRj}+~dR)iQBlk6CC?QJ)EaoLeJkE^lRN$HB8n=l{8NcQ< zN%rM0_i9uBo-Oz0b!TUDT*|Yj8EIW5rt(jETb3SrU|Jin(d^&}QPOj{RY=sRUP03O zq)VpvTsFL_%aVUlBlG0dxbn@T+x%K?M1SWsx!a7L7Vs_}5OPD55piZxz5wm zDkK`RlL>2Zd-2OwmMs!1!C2!JC#I$*<@QWsxLG8L_#yk8W6#>|$1+?uOCXK&q+bg) zZzIt);zxhH@pCmIA+ew9O5eFRw-Xz<8H>eDQ?)3c_gq2(z4?Sa#zDrUsfmkF z!u}f+_y3DVVH_+ED|0|0cwLnztb-sd3lps{HHm}39)iGX=ng`dJ^|pdpe1^d@R%zi{+*AT%D@P43(prlX=|x|=CDl*2{ZAs_T3fUmZdpb4THeC ztZfTvmP=l~0+%icBQ;p+ID_xfw*~<&S;3kdSZvxOd9@G-iiyF-GKf>3-`tX$!k~fH zfQ`Qfj1%DeBVeR_I?zI*`1*C05YD7t9U?%KSuywL2J<}mF@K6dx-rzFI(&fNG710^ zU=jz$&H%*ALJ$l3WBTpXW`g%eRA~Bz0cK$n*bst*W@07oAL#B9=knPk- zChEPw2$_f_&zxY%B*B>$SU@0%ubcB3k4&2LtJ^H|e}g^d`hr9z{f$B9N>Q_1n&AR6 zX=Yi^R!=WEpq^eLf1;kApnsyCHiW9}zGMpnkx3Ygpgje_!yQL4vPB}*rXYjeCa^I?Ez_QDWA1wrauWrT!9s0}>CUq87q$�JYJ< zw~@&F-Nx*>J2$>ZCQ*-4*(jRF1{AQA=gj5EVKdORoee8ONPaX1aiZdpWQH`l{7Dil z8=DRG20=VecONdPUWA5TS{|k>7Rwy`So*s_Jn|xW z<11M#8h^fmlmi3kBc7J0=bz2Lx#L5pB%AJF&tpL(5i37$ZeS4k0NC=r`3I@Y10TP@ He-Hi#`swv< literal 0 HcmV?d00001 diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 0604df780..86deec8ee 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -314,6 +314,32 @@ def collect_tags( tags = request_tags_to_df(n_head, warn_lower=warn_lower) _write_parquet(tags, fp, write_schema=write_schema) + +def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: + if fp.exists(): + print("Checking for new tags") + prev = pl.read_parquet(fp) + prev_latest = prev.sort(_SEM_VER_FIELDS, descending=True).head(1) + curr_latest = request_tags_to_df(1) + if curr_latest.equals(prev_latest): + print(f"Already up-to-date {fp!s}") + return prev + else: + # Work out how far behind? + print(f"Refreshing {fp!s}") + fresh = ( + pl.concat((request_tags_to_df(limit_new), prev), how="vertical") + .unique("sha") + .sort(_SEM_VER_FIELDS, descending=True) + ) + _write_parquet(fresh, fp, write_schema=True) + print(f"Collected {fresh.height - prev.height} new tags") + return fresh + else: + print(f"Initializing {fp!s}") + collect_tags(None, fp) + return pl.read_parquet(fp) + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 9768495f9974173ecb6b835464174a7b0bea808b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:20:03 +0100 Subject: [PATCH 014/137] feat(DRAFT): Adds `url_from` Experimenting with querying the url cache w/ expressions --- tools/vendor_datasets.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 86deec8ee..65802d130 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -13,7 +13,7 @@ import warnings from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict, TypeVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypedDict from urllib.request import Request, urlopen import polars as pl @@ -31,8 +31,9 @@ from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq - _T = TypeVar("_T") - _Guard: TypeAlias = Callable[[Any], TypeIs[_T]] + +_ItemSlice: TypeAlias = "tuple[int | None, int | str | None]" +"""Query result scalar selection.""" _GITHUB_URL = "https://api.github.com/" _GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" @@ -340,6 +341,28 @@ def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: collect_tags(None, fp) return pl.read_parquet(fp) + +def url_from( + fp: Path, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, +) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
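# ---------------------------------------------------------------------------
# Editor's illustration (not part of the patch): a hedged sketch of querying the
# cached multi-version metadata with ``url_from`` (defined above). The parquet
# path is the file added earlier in this series; the dataset name, tag and the
# printed url are illustrative values, not verified output.
from pathlib import Path

import polars as pl

metadata_fp = Path("tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet")
url = url_from(metadata_fp, pl.col("suffix") == ".json", name_js="cars", tag="v2.9.0")
# With the default ``item=(0, "url_npm")`` this returns the npm CDN url of the
# first matching row, e.g. ".../npm/vega-datasets@v2.9.0/data/cars.json".
print(url)
# ---------------------------------------------------------------------------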
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From c38c235fec976be66f7298f484e83828f2edf8a0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:31:21 +0100 Subject: [PATCH 015/137] fix: Wrap all requests with auth --- tools/vendor_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 65802d130..c5ad91459 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -165,7 +165,7 @@ def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: content: GitHubTreesResponse = json.load(response) query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) if data_url := next(query, None): - with urlopen(data_url) as response: + with urlopen(_request_github(data_url)) as response: data_dir: GitHubTreesResponse = json.load(response) return data_dir else: From a22cc8a2d8231d0ac56117c3cd2fc56a2cffe762 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 9 Oct 2024 21:16:45 +0100 Subject: [PATCH 016/137] chore: Remove `DATASET_NAMES_USED` --- tools/vendor_datasets.py | 45 ---------------------------------------- 1 file changed, 45 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index c5ad91459..167c55590 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -421,51 +421,6 @@ def __repr__(self) -> str: ) -DATASET_NAMES_USED = ( - "airports", - "anscombe", - "barley", - "cars", - "co2_concentration", - "countries", - "disasters", - "driving", - "earthquakes", - "flights_2k", - "flights_5k", - "flights_airport", - "gapminder_health_income", - "github", - "income", - "iowa_electricity", - "iris", - "jobs", - "londonBoroughs", - "londonCentroids", - "londonTubeLines", - "monarchs", - "movies", - "normal_2d", - "ohlc", - "population", - "population_engineers_hurricanes", - "seattle_weather", - "sp500", - "stocks", - "unemployment", - "unemployment_across_industries", - "us_10m", - "us_employment", - "us_state_capitals", - "us_unemployment", - "wheat", - "windvectors", - "world_110m", - "zipcodes", -) -"""Every name that is referenced in *at least* one example/test.""" - - DATASETS_JSON = { # "7zip": {"filename": "7zip.png", "format": "png"}, "airports": {"filename": "airports.csv", "format": "csv"}, From 1181860ca6fa4abcd8662b0c9f5de2257b041b76 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:01:24 +0100 Subject: [PATCH 017/137] feat: Major `GitHub` rewrite, handle rate limiting - `metadata_full.parquet` stores **all known** file metadata - `GitHub.refresh()` to maintain integrity in a safe manner - Roughly 3000 rows - Single release: **9kb** vs 46 releases: **21kb** --- .../metadata_full-schema.json | 12 + .../_vega_datasets_data/metadata_full.parquet | Bin 0 -> 21362 bytes tools/vendor_datasets.py | 668 +++++++++++++----- 3 files changed, 488 insertions(+), 192 deletions(-) create mode 100644 tools/_vega_datasets_data/metadata_full-schema.json create mode 100644 tools/_vega_datasets_data/metadata_full.parquet diff --git a/tools/_vega_datasets_data/metadata_full-schema.json b/tools/_vega_datasets_data/metadata_full-schema.json new file mode 100644 index 000000000..2b5b9d955 --- /dev/null +++ b/tools/_vega_datasets_data/metadata_full-schema.json @@ -0,0 +1,12 @@ +{ + "ext_supported": "bool", + "file_name": "str", + "name_collision": "bool", + "name_js": "str", + 
"name_py": "str", + "size": "int", + "suffix": "str", + "tag": "str", + "url_github": "str", + "url_npm": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata_full.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7a4e691cb414735738f276950d79e8c72c5f4b48 GIT binary patch literal 21362 zcmeIa1y~i`{x-Zf-Mu&6-JQ}6(hVZr(xHGzcXvytA}NhD(j|&?hltWj!?!o;<8#jW zpXYnN_q^YAz2A48VKXzUe)F3(Yu)#nd)ADaga#)F0GfaYeFCz8lG#BB&_5DB00R(6 z00cl5hI;mE6B~dwEI=9#77|Wo2R?)(Am7~HbZ7w8lqi6(Hxx=ZK=_+wl@4g^vi!cR zw4^BfjUO12#MgnhjKjvl1Vp@9_G9Rv0pe1au)3?K3UI@s>%;awM3W<|amXJ|l$y{$ zSrD=_@zPRz*9`&?f)EG^&@DaPjomz)ot<3WEiFK3;AD1iGy=E?h7L*iw}&7&P!0j; z0!0Z_00jMV7!&?E2#`u;O2v`jy4VORu1v1?pqiCVeebV^+D+}mbQ9sf&le{9^(z{y z9>4&g325o;N<)QnbD+KurSa$Eo=tpvdhCOt+t6(fGkpHBqbyas)>$$aX6DL~%_}%3 zH2uqIshdgA+%{ZZo!+^{EH574eQ$@46zh>|%OgUnFsy~uU-CSK$>KlpMAOf9Q(DeD1K}0Rue>^HSV=F z7| zR%@)fS;}`sS2w!*RyF43`B5oq7P^6jBK4AwkkFN0;*yeAg$6$J}V z2UTdVp6W8_0nhPZoW`wOw{_BdT^#ggyB+q=dHNY{DOmg_zQ$2YwHWZEE<#E;B4yw%gmoxbHiD&UFaj`>h6!fD>OdJP9}Rh08B(-vd7 zhuexF_rfi^^mzEt;-b2E7f)erMD~UoPr-Zo_sbf-kiOW-2&zTD>x)>w`XZXQt+Tn4 zg{2#tx!Yp`Bz{Pc&DqiVS7SFpentd%&G@r-B7TGS_csw|AixL@HtVtxAd3&TkSy0@ zOOLkd+3^(3i?@+x%1|jww(GNNHmXjymwh?w*p%(qzkBZ6nddCeLKG)2F3v%LhlVIE zS!2ROf+;V}m9NEuuOwTy>Ml&KCeK5aqC~HySZ`Ku$gZcvSNOr6-%zD-{hN=NiJAZn zI7W_!f`R}GB@~t_Uyp+*7LkT3LyajFm9B8sSttjSfrcnSu>_Z?*r3Ls5}$=GYtpWP zn5}gF(7TMb=678V*3d=D@|jfDWc9~LJ;i2*bWpEPMNKsrv}m|Mg#8?95f z=-NVBwS5T;>ZYu*5vBqHyZqHSNU)Im0w8f z4_ZVfwZ@>kWU&^r&MNoTaXnXK{1ExU^$j_P0F7i62L%Hy)XMx&}sZS@I8qk z2T5=ywE!}&9OP{2a<}B`%?54Ozxhhr93rtnTA77ytQK=>^j4dQn~>4I+>U1^^@m8> z!q|_O*!g|yfzU#d6l&58QjC~wI2=;S60ZOrGOq;&Y)F+(2)h=5a{$01r{BgN8YKFOVi0}y%(v<}CF1LA6biMMF6SICMsgI$b{w5XHjr?+37`C= zOAM%!#^z4;_O@=ePL4m^J%9r*0O=}8|B)~&u|avrN}$Z&gsBV5ivyUZHmBrpnZqPf zz+tX!PhCvSD#1pJkVq|exDB!~%|AQbYcn#?1sDUg0qM5ttXx`1ms+Ml%qfou$4~R? 
zyavGs5Z2TRW4>p3%NMwLwQH<2jI1uub@^_qv9qu+vyk@kp@*!1+GQppUsRG)x0QU= zSH-I={cTP(?#^1NKvM$$3Vpq`Cj9{AI<>*;C_Ka)ftW1%)DN`*V%CM4P6R>g+?IZl zGVVmt{&6Qn)ACa}e$^=exF!LIg6(ey5;1meHEsl<`f6{sG7vFXuy%oF5qfmEiUF2u z{#WGm)Ic)7vH-PP%;%;A2>ox87r!Q@)_l<38S&vQ(%c2u%-hMTN!^ZpYNUd-UXI$ld z>QU9Lj-6y*gk>14#vIz=ma3;<6^w6G9Z}lk#(Dk1(q{LWsYXD*ZulA&I}~5|lNW+(7FimZvt*s~(@HreVgN z`L3ZC4<$p&Y!V4c;19OK#KBXHlr8#69y<22nscepL4_oxn;s7g3^>NWGoo;WLL$HD z_vk}-8Iu8zD{QSyUX6jVhEUn|!5W0q$9r6~Xc!nZ$kiAuFp45aZ4~EmMAs9ke$gp2 zUm>Hz{qtlz2rc<17Bra%0BL3Da*7y=4f?Dkx2oAJ6e2ZVXxo>(W_4+Xj2Bkj6AO~< zKv~EcJ!^Mdq>yJ?V`dXh$HK;<9SW-^%OG8bD2>>8X%dc3nK_)C&)7Q zg$_)xF$n{w)qlO}b9_v|DN(y)&OwbtvM zwBiK05eG}io*F%p!YGA;wNy63jB|v(lx7o!j$xqqYMxO*d~5RL@fsEO4pKEO(0NlI zZ5Ig(0X{8z)?{=@Eep&n_qbOttwaGf@f~Z4uVGKO+G~%>J=3hZb}o*5Z|y&*3Goi_ z?c9D6(><_1KHVpBDANB+XM@}%K`=l2pD~{ng87AJloSbbEcP;!Abs`H@7pEFKzJCA zgOppm6>DG^if?QJH!BqLwSHs1L=a>0BLXPq+y0N3ulyVH&7qht$yW9(1~#j2kl&QM z=;LGRo6(A6f>1(}X38!OH1P^;fq#$rXKP0pwPN{HEoicxa^C37MYiMjW)uN1hS_|R z$KGdovh~d~f@nP~Wk$y%u!x}WZwF=KOgMu@c~maqH*rbv08GF1f53bR$gR>Cm7RdY zm(}6(SF6D%8or+cZ|UywRS}c(R$2|^GM!nuf%V^okrM0R7ByymTZf!_Zf<*UIwL;*j9)R{1Bu_fS2%qK}8HCQb4pIHS($eP#FMt2GHVo3G>PjWE2X>M6w05BLG)hK%%$+ z{a$Q)VMO~5xCd>36+HwXGT;NdFd^dyn7S}Pd5GXBa*Ygt6thZ0VGam{2hbvIo&jCo zX^oI!H^4L50~63*zyOr^Nfz(`?jOa>1Yz4b0g0TjEg}GfSll`qxTO}r$w%O)vz`MO zTU}T;U)&NDfHMgU2__&F6vP5_K?Kax!=yTs&!(lY09y4O5$t?$7oOh9o&ei2EDiD% zOssCq=*mo#T`SRR|5t4vO`A>N#yKe;a#&!8%pO517An`YVE)rk7g9}-@ht#AfCM@B zU(214YJ&Wn3?R+%SHs=^1*s+xBGu$u{G(LEBJx|R>9hE2sb&x&)u?g(kZO3Ce@Hc{ zf0AmM~>8 zcFu!8zrvvUUZI_Z5Nv(qFE3uG_(Qx&vbqy*#)`M8(IZNJiZ>$;Q1PZ##{nYVqzEvZ znTUJRVTQx;yeOiabZ>TOws>tY{bCfN;I!db%Y3<0aK`;*Q+_Hq0%#kKmRh&B!=Rk?93E4(#P(xm-VCTnRALKR<)p0y&$09i>gp&{*VTiQX z4ZTI*#oGuiG#GLjus-_mhn|x&2+I$QVPYb;wGm>UnQ+f-yq^tP*lJzbDcfq;Oq}f2 zf5|6SmEK>rA6WWB-ASK^s5_|Euja1X%t&BY=<*i;eX?`z zUrOD!G|o)N^}<2`s4C~2Hn{4aOn<&3-?uOX)qi^R50L`0{Fw+bmr#RnSqm+pnviv0;n5d}A#I9In&gZ+*tV`W56hd- zkx>Y~gtAl^AbjI06gZupGspq*w}?9+YWhzGk`k2R?vX#iS$hSOsZJ+#7L&Wm27d_H zuY94j2xLdJ`679Q^d=B)d|gi(Spw64A~fquHpPkt ztX5*uboWAGwSR)EEiz)TeN*1p>hKEkA;p$^*Qd{82I4y__KDd9!FI6C^r>C$(m^;_ zrSG)3IT-Ogi>ixAFij=G0MXaW#Ga6jcDI5%7GpGwgTKX?H| z_geK1p8qFEfDFp5K#ocvv|o=cD7Za3Zq*i2nrot{7d=ni`x!_uMx z_;7`3ogIX(WI9&kY)JPGu9shOC00>H@G$k^nlU6McLha$YyG53jlu=s0&oJx8DBfU ze?3o@Df>vwCHH08!w?o=NZ z$yGEwmiV){P>qa(eJr0%n_{5sX?!%V>bSI*#j}KstRtjdWltHW1>;XV*LC*H#wB(> z(>x7jgo|`!b@a^RtfOJJrQuDh!7&UsPi=_lL(yFXt#-|651i&U5bR_Wg|`X|7;Uca?N1UC;W zD_hSWY~6$lu12K0u!J^;f4V-9hH!-l`VLeA8UAVtCO3c>SO8Za23Ga1yEeg^hlA@d z5Ap7S4Xk5kP$EiYoGpgp=kHIILPS$5uG>$#zui~3l(To@mdktF-LdnTPPNj(8wTo!NLUes*lrs~N-9%WTZ@q()7~4iKZ#YIEEgPQmi%6I z#+!*WYkpmxB}$W9ZcQ(1Pa+#>`m{6hQ9EYk%f$xsXCOWfwN)h4RoA$Tv1VnY7FmLk zfoP4zmiYXmMtws3{N%MaXOdHxK4RU>0CX*{* z(zkQJe#gw&bNBh+9vnJ-HlccW{y7%)%v^OvrwKM%H9R~5qGKN79H0y+me&c8fML(` z2f!JcpbI!t_y@qi&`bmd0?=?sNJuEe{W}q&0_YgjMuweR;mn6XgHP(Fi}8_h*$eoo zd%5sI9JEM_&xzIj@+?Es3Mp!6U%k04i=?DErr9YyE6d?4wHd2dF*e?gu$+veMR*>b zX-MFxnw;#cE@*iXccXXusFnq%yD^whN8uf^^|zx0MT6O)(4@Nj29a`FbRAS48bZPK zdcnOV<>&Ylj9Gk1+I0s@W<3-sXK<8U*nP53WAhq9kg2x@1&tNoL}RyH5o+>(0ZU5& zU#4#cmf4683f*up%Dz;iJU)aQ?4VCqY9Mfjo! 
zj6BaDE)DCaIbwcO;#Gvq^%DOZrQn$%j>lmYO|!Lh=jYr{#<}U*ww2$7FxzGq@RO6r zKy%EZWzt%0jJa@bmYHSO>iv|WulGz`^3^;KwtM(9)IE1oo<6O}G|1hpEc0BNH?j8X z8&Yn6N@6T(HpX*xSQp)EAhVN-N2S#O!>Ek+Rso}!Ot4CkF?6vy@qtKoj%_7f+EcG| z&&gBo!)^T}osHRG)hnN!L;{COP=QN_Fy1{8H5kMpCiL0WnX)~iM2_qD?}^$0UR56* zp2A;`E8DRha9b-sucdXb%`s!Rj~vx7ltSzn@>L*lV*lRjwt*&F8{M~!o%G-nldf#$ z+A{oSMH!c=A8RT)FfuaUCx;zoBqp6s6N_>biK;a9p{Trzd1+G5!1y-Em%2f+sGTLM zxjp7cVp;PflHB(+>)v&%g?{;Te)LU*avf}I@CB_+RGh#f)#>_E_LrE zNg&(PGpyAkpUnqeW&OpaM+wbbyn}{KY$8qp$0Mry%Uryf-dQY$1o&n*TF3<6;WMPv zZ)5c2pLrITrKY>!rZyQPz?L$+boW_qHYi7_Gz%Af)A=6Ba8M`tF)0G>^?u+K{wV6B znTngzY7=Am=PB@*+BGjj#iiO!cinsz9~M08^CF2#5K|ko(ik^Dpf4A>>O1&KKIVP( zvLH2hz+!5IsoeoNx!i_1>bz_8=pcJ`ccNLEoWtGg5DvUoY)};UHQT_wMHk)n9{zW; zntQ8>5Bemvw|D&>ACu{694oZUtAx!x9LRB2G03nrN)Z?XskRn{v)7U~d$7&3RRnv2 zi^~Yydh0(kgb8`Lm!!ry>4vd8DDOJrW+{@)*_|DS4qvARNzN z!e>^rx{sf@c)+npFV{k@@Ldj_VPRuzHAj)or`dTH?1w8`{``%EsfA!J_U}ns52jD$ z3E6t)U*jr!?kh}Ysy)*j$gHCCTU;u2MWCvz%1rAh!s^yxZaJOnT{V20Zm{(NtjCW} ztyhbbg)YeZt-<(7Q$_XqgzWWFuvZqR#kf4Tc};1OI0~)etXqcZq?a^U6q`TKx}XBhf~!g~S5{K8~sJp!Uc z+~x=!w#q3T*CdnQIg^6)+r0!U1{3IVu>g8(6@%iXN}5lc1)8Z>8U7sP~VH9=^f zs~LyA=(>`oDG9Qk>74!lESi|vxVt;M39+-AI@_{Y+q&C$n6a5VIk3B0Iyi1_7Zl*);V`un;N;-qv*hRc zb+h$fq30i`=KQ}b^nCk6=rQ{x^w@pJ`2H&8@98{_P@Shi$I-WwB2|Fd?jLlXS*Xr4 zH$4Q^d3qr_&)5GOou?J5^UNDGLvOcbyGSlX=lSx7&eL_L z^9*nE+@)=;LY9;Nz6cPfu2YoWudh3ua~57$?SG3~s3me1)X z_I8PM@F7Fa)ezOE6}CYYkB0aghHxtuYX0NSwlSPiUxR&_Ivf%-Kw*d&ntS-|Va~w3 zIEf4(nh$%G-srIe<<$!MgR@AP9-LHo44ksF7%%HI{830gf^8y$~K(2V^8T! z5+Q&jJ`!Ozg-IFkMC)C{t4AoXMkpw>;bB??c}o(O=QL!&!Vkq;$7YduU>g_(`NHg? z48tv*g$laxEkK376gz!*UHTI|WRfsreR*G719#hNWr7MAw5 zk6qauE#282ogMxWsKtMX2kcPbX8$z_h*v5Kd~>vG2iGU$BlmY>G<|#qbR685=$%PSr_7FdAS+ zT%Cy{*`^_BTkj6zEpLs6%%<>N6it7~s$8x(w3|^ol|Hx2;%$g}C&)r+<%WFt!D}*P zF3H_vAS*@jSYMH7tfc&PNn$LQw1NcLZgWm5>^!KjC$uGOn(@{7TsPjUes4PLkfZ*U zRb`l0$`2e8r%IO8n3y%}`mG(w>>~n&)m`z1fc198kaNX?VW5X>V1W4E3WtKpLpzz8 z=DM^v2HO2ETlkruy_1p3u0Xj(Wci%j>7_7~q5NT*TBXO}fo9CiLe_DC(5El4PLYc> zlPi|`8!ywgMv02uOx&tOJ=NEfbbK7fD#yOq4aLpnN&9@nn|F;*r2RHIEgwNT^MaB3 zYh!uSY(a&Mlya7`@rQ{-2mQ2c%`lG;d8K~D$xKu!Z`bJ=)Z|PyK}%Xi#-V7RDcg8E z8hJ^DXM3Zci5jM4H-fZ1JWQtK1#H<~FzJ2BYDr{6xP>fxe_k-Y{0MPz|J%h4q3|J- z@SH4y^jjF%e05F$HfO+K{D8Qjp$%R4Y55XnkxDaor=y_>UnuTz}P%yVKA?;O@6>8VukJ$be)+0q}>k9vm1r0FntJfPx|b5`+OzH0=XC zak3LyFP&9SbJ!!3HKk*j_fcizgm|k6P8{i?)%zxa@XO}j!Z|g@iYtC>1KHxi5htR_ z7}U2zGX&2pj(o3!xG03aMOoxj;~(90sms86`V3rF)?_;>;9Rn{iCl`lvC?ojgV)zZ zNcf((JuLQwba^@rEI9Yf*S+^VH}v?LbA9|aw;24v73=C=e`&w&p?S=E`0k{>Q{qnz z4BPqL^tHE2Qx9g3_YcO<2g1|J=ZlWAJgMErs_7ft68DP+Rd>s+E@^rAOlb2QPFs*R9(ghVu7)LA7h8G|Lz}Jh(kkojjaC+V3$v6@(cpZ_f}rHBL=0dmVN# zF4U2;9O>({gb{C)mUS!smP!mC6c^t91pVXCZ0pn$K5;4B^e^}AKkgTP#=XT>Da}zh zXGHmGBx-Rp35Zslz9*Qi=_h(XMb{Of3FP8~i+ZUK??mpa`?i1RGj8%WBHWSDp7{6U zq=Fk0^@=H}w@LV8amcYF=HmgiAG;$XwLZZXlOv8!Stm#(B?Z%4ya-`vO$MUG4m4`Y zbzztHcDLHTTH`FT#p?G9rIL*N!)n=}Ftx?@5GF4@bl34<6 z!o;-^mzqSK1-=S@jvN&m(W27;IsPbryk4(g>N?V)SM%zFNs-N~ZUn8N@e_}Cg+a3e z@*ub5(ahnnoG7|>pdvu79X5PeFnqFsO2B8W#qvtsom`Fl3x)gU7|2uDj-2pz?EOAx zNqa8aweGlTh`HFMdxG4B`)ZM02^ieJmqPK#C5ltgupAJ@wXC?~VfLO_^^b;(nWyFF7K$P&?@ zfy$z%(3no~GWpUaSZ)sGyy!-bcq6$@jE$34x4OyTqlDrhtn1L6cq+LM*UOAH?{V>_{QHa7yc+^#eeFr7L|jgikB4}CGv%oo5>&f{d$O~{ zc$4W2)7WE$@We9wa2)Sr5<7Hy!dxk~VnwJm35ur(B_B}Kb1iiHjEA-!NRPr(axN03 z3Q&0(iT64;iLX_x5FQILBxA7pBpPvBq&LML!m>q{#-v&K)^>G^4Aia7rAJ$xs-Y;!qCPXTs~Argja6i@qZP6*ow-0YR#0??WL!w zkbUs}lKL67qNiBV(w^8r^7F2 z-4pl3kd$t7ne$f8kmN;g-Y!PsEkO(4)pT&7uB4GKnH__uMH|hzktgpn z7eCid)=e4d^CzBkUW$+X(~k{@3Jlw1HW8Q7O^IsWm5t+I_G4^#7WbtTzIf-i3VXQ-FA)-6s}w-8`;E1bc)zi8o(Rr?&EK=qcV7v+6~V!gL6(5)URG z-q^`~5@MVjBJ3GPq_+UWf!2cY0+8UfIw3k$Rr4 
ztS~*V6kSe~7t0X0iQ^m+>m@;?1smdoE7^oD+T@phK;JzEe5zuPNbyFSGr6EII(+6 zxvWZY0;F_1+zX|N!wNTYpY|i|Z%AR(M2KFHNh8>UuO&}%b=wQcmxZ%2ltwVKf%!9e zT5{9IYo753)8XPO5&$DcFJFT?-I#zM6{{hy?d`4^L$(i#dVS&}v27~&W{ z?Q|1+sGH=g5bH$(?h!l^2`%%UhOZ zk$6GXG}Jl7SfVFo2~Y2#EYAyG%nh8{9~ay7y~LhTkj>VItL|NVTVA&6C4D&D`USGIh+@vVzt{gg>906VL1zUC=`3Ir^NZKbt z5SCCsZkp7er3SlO29_yP99(p*li?lse53Y$_IA~+?(>`O>TE%FrW4CGC#k*k>iEYo z;GbPrO)u{uTE?Z@a}ftZ z;EHF{x#A{Wg>`7bS$M5u&=lOQGovjc(`DjSgK$OjH(hBnH zK3&8e{q1VFM&W!h7`|3DE)(-ypPu0s_0gr26n|%UOBt(K`C1rXHjA{dmtXY=v9Ne_ zJbQz{4}4}_GG>ZD+WXC5iI~q66FVve%ZLyrRhYD#4!j0V>CtRf75EI-Ukq_OE;4_n zHWRIiO1H=KL?3DW((?VP{C*m_XY18QY|JdeI|>ifM|=a9x<`LkGvt-3IWeg zFJ8&di|D(FL|#mhgK7Ggt3cjQXy>e~x>HneyRvzhbzTBZU)m(ox0W$ctnp|+QMgB> zeQ0cdt#uL-zM}O2cRAhL!%=&v%uV)1rs&Y^f+E|Z+3*6c6F!c5bv)E4xES0Wjl*^n(@BJ8gDi@e7Zw^<7V24D-Eu8DhG_;YhY|i8;f7=!|g1+R6i{HuCSI8rAk4jn7!BHR_`O)i~+Ld zS_FNwn`_1`2#CX0%O$YD6gsHJEH*51sL_QD7%F&{ z5W_DaOstZWjh8FG+GS2fbFA8vb{q$CwGfRs5Re_5pj506_~w11Ry&LN0y)=$3e>LO zUJJ|KC&&3rwXN=?aCLmNCyeUhP)fL2*_B^7>$eW+G=5U4wiqSutI<4`%P3*280=#= z{x)`@u$(70H_c8`z?Ya=lNuyORX!W2>@*L7J@_iZq?h1kNe0=*7MjmOtpQ#IAOjuEYeKKC3$5 z^iL{{VRZ0mWH|WuzIuvEl7y<>EY=b!uM3?vdAwV+{ zERXHq!i3!@1j^=ikE0;;Nn*q|v=0rSO zc~-kyH=;OO=AV2=i?t+Lj#UTc=Azoii3m0+!b6*0tODwIB zgxhBbpv2fd(rJoUH!;)GbrIjjc-GYpFNsz!r`x6|iX6<6xM$OeHmM-f-o;E@%%yPV!WZW?qI{q-QAo&30s1k%9%h;o?MSs~c?d2i`=4cP(aLQ6q@5kI$8-N(@9XE>zQ{d2z4xNupzX3!Axgm#M`Ii@t=Ao%uFMd|05NBL zN;g_5{FLK*KX$9|yQ*T>7uMdaIFrn|ks&YHY;SW=_#uzpzBP+kuWqb45YQYm{W9U~ zdCz3zE%4A^$dJJ!PDynt>!2p@IXYRDyqiJ=1${m#tt5hU9&BZDtPpvXtZjJk6Cm%} z8X^JJ2$ETNB2eyz;<@Q!9#eKARywBvrIJ5^Oymr8;xfg? zQBGH~V=8|Y1b=|Gen!h%xixnWQ9OxrMR5xmc58X%*}ZoU-iEPDU1=UHq;$}@h!qed z+Q1$;X~hc2KFRkHFR~>Z1+d{u={dLI98rBkq>vG+L`pI#ArwVLB0Ut!4`bjenb z<;+3$2I)QEtw2dHnp!VqL%OW85%2>728M;m25P9mjuL=FQKnOhL>5HHwl|Rh08l^Z+9XjCa`KuhV zQrWy-6Pt*;B`IFerEqEc8u@yHVHw`S{4F=|&3pa=Oj7PO=T@mF>K9#yT-P?8=;0N% z!GVNBGl!WEq(^qS-)7rndoOK&h8e{pK@Y69sMqzE*WC))3S2xZq|X}ATQ_J7kmS4# z3X(TsUcCZ*{0c)$gVVe+)L0q1W=2eCTZAxNiH{t+$}*N#fg+Yul@`=793F~oUV;Z> z<6m%1>0=bxj^enrvySTR92GtEVLyNU;blX>WVOec~pU}@AmyY zD~q30uek@OFLd|J^eJ8q8yT;=mAl+NoNB{!vKpPwe~gwQV7YXWAgt6+>*MxevO>5Z z>gt;2;YX@ZXuZbj`^2K)aq{9}{Ts?fz{i05aba;iLPfnYO;jZHr0<$U7W}e-Cpw

*1L@E^lrhJ;~e%ORkvV6Zr`B(p%srH zkEFJ-rg20+HuCqhZHs=tAZg-G*o#Hr#A^Y09cy%9VdKalWA!x7KpGazcROp2ehk?J z4ch^y!-q6gTu)G4IIDy}-rgn?GQQ|KTsYXT8E7#m{njwI3#p7K>^Bt4M<(+(5Cwgd z*vnC0%4_A>zeetay+ZhEPf%o#oD*EV`J~sIfL%J&G-RPSPXJ$0x-;Fv z)^;kc>RsBTzCUJc^a!K7*^r-u+-6CUwHnTp$l#mp;sMn6UYvVP-mKz?eTY&!hcv-H zS!cB5Y4T^eqRKda=rnWUh^dbB(&&9sexR3YhZ|3=1X6HI8M?$Q#@?}SS2huvF3UZi zUriV0biA)fsVAZx?&%p1zRpTT2EbHPP zW|_iROztbX@88o?Vu%wB@{S~meRl8^ROv)redbGB2R;Sfb{T6BBBGeWMjRR2QHxuA zmb^_Rf0K+T>jm@?$R@o-W;VB07q0t^upZ*Ck?T}HOkAWO0@K96Z|uhq7VOc1RT-a@Gw<0X>uE0^lh80Z#F%qBLNZF~Je(b}*$^-w3e{(@$Yk zm`OIqV#ui+9>}?ae>Qu3phH6N0)8c*#jq{5C+mEx)OxR7V_|A={1dt=)z%xD^8hln z_q@|_w3}LoX3_2Kd7-*lz6tiO3|KWf`k(pZZrs1*e3*EemrDwx#}DWJlr<#RnykmO zYDTaKuPNcf*zK1}MJKvzw~4N9!N?%>>7tYUSW1!G(x=XmDj*&Ci2&yqJ%^No4 zAe+gNYqN`NYy5$%y^)OM`55V&(l*eZb>QurEhz_8%vMwq=`J%hnPSk&!AKq9b5IuS zdl5_vbT8Ge6Vs<0-@{KcNal(2>!KU1{H*NnBQh+f;P;juj30w^p9W>Gh!m`JGwy_V zGm8q6FlJjF@-`O;#IFhowNm;ui47KE8VbJ$S@2$K(C`z`cs!$+aek;tah{(eYo$Ve z5b7f=rs(@%uZJvdc@r_(oLPk*PyDdv3T9UNU~Wz9^w=|N&4Q%<>|iy^^_Jwh()UC? z`wL@r;Pdao<1c_tXAwN(mXoMjE@rWuzAO*uBl*{N)G3R6r*^X7qSiz5IEOVlY)`g1 zw<{2qV6VSmbt*ku{J zqm-_X3_U5qQ5Y+^G-3rkl3#`*KxbVaGWeY0+UNVR((3dy0q@#K~l&n}w7Ku;|N!M$$M&Gaq(JQ*JPtB`Q`u z4Od|;rlO?GdHZmqMN3Z$<0A)@N#r^7E*88te2=uv_?whp3yc)5KFES8N2=}$?caHJ zgCr4`{Pap$XACJ{)NORw_7{|oXkp=g&8zE7*Iq(T zI?x)J@?Fke;Wep6kBjA&MtQeKTrbypA#~?5+^UO`L_Cm=2_!G{y#aE@&z(fTLArby zXOC-#*TFXBzV7aH;1i568_e)={@zAw1!u7v9FE#F;VkK)$2$y=XM(f<$oFaxp^pS1 z&%b^=4-6e81`pWNbyowS4+Q^Dd0<9}4MPa|un*)PG7pvz3;^6k{P9T~&PkH|$>5n&JASHjziT_i{pHtQe3GQY{|0&_u)MG+|pR;@al=dGy zZ2h~_Itc%hxg&q~iSD0H=|6rnN=Wd(qs9I7hy;uTDuRJD762g$I1dAqi1Bm3k`%N6 z{JCGo2?JWlao6L4*&wt!u-=Gp&%2%A#uOI&_N4OB7qiz=KQCxVf@JY>y7DO8KK30#5sHY zii3m1-Mz)}qsY%W{J&D6C7`p;{~Z3)RDeH6|AK>`ageW95t0yA&{e>YK&ticf8Po` zR0abAgV%ClKvV#_S`v~!rXc)2@gIouw?0h*UV)e+00-y6fMfx5k>tOe`KS8&Vex-+IFlTy|%qApg!t zyd}y1(n~nt&O0kJ5G$epx)%<=UHQ9jZ-8g+ETk5|fVcp3UtE7%_`9zsfDi603_vUh z0O-~#fWJ-rscrqJ*Z+S`GXaM~rq&RF{R?40C9sfY`5e+o{y)U3;DS5%OG;oMI0*pf z^pHb(I~@}EAQ1)+3Yr9H5*iw+xkB_!M=AQ>QAbqQuQNcxi}&~ya}3G1KvoFH3Cb7)Ww8m%HBDIp=H4#_T|eP=@* z5|+L*W_wrSmwgEdeyBaKJ9~G=*B8e$KU z1dWo{@iLPY^e~s>_>b*LLB054&(Ty*!^uJyaz3he$8&>v00}`(_0CmES;#KL2~DUU zkgdkg`cQ^uft;@Z#7TCD|BxhTlq83~uK9oL-`%0*RX7bC44f??R`2T1`_7-cl^pcY z5;D;0`iuJ0f_Nr)mlKi;vRFf}7Q|0$XnjM@U&Yzd0eT7kH(eh&UK=xA9V>JBNB{Zx zL$CKwd!~-c|IQw?8vf$^f43iLXs!LUr_UwmW+rFvZK^Bi0l9u|(CeoJN&eaR^r5>F z8juQuUQ}yHeL=VCKbnN}U7~~p`(6Kl99lv`5gKLSVDDz2Bj};8%V`e{|C{#y_x*)l yFF73^3t2&L3q8pvX55M#2D<7J00IOxCwo&@H^K_Y$m1`5fDy= (3, 13): from typing import TypeIs else: from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString, Required + else: + from typing_extensions import LiteralString, Required if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq + _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) + _PathName: TypeAlias = Literal["dir", "tags", "trees"] + -_ItemSlice: TypeAlias = "tuple[int | None, int | str | None]" +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) """Query result scalar selection.""" -_GITHUB_URL = "https://api.github.com/" -_GITHUB_VEGA_DATASETS_URL = f"{_GITHUB_URL}repos/vega/vega-datasets/" -_GITHUB_TAGS_URL = f"{_GITHUB_VEGA_DATASETS_URL}tags" -_GITHUB_TREES_URL = f"{_GITHUB_VEGA_DATASETS_URL}git/trees/" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" -_TAGS_MAX_PAGE: Literal[100] = 100 _SEM_VER_FIELDS: tuple[ Literal["major"], 
Literal["minor"], Literal["patch"], Literal["pre_release"] ] = "major", "minor", "patch", "pre_release" @@ -51,6 +74,14 @@ def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) +class GitHubUrl(NamedTuple): + BASE: LiteralString + RATE: LiteralString + REPO: LiteralString + TAGS: LiteralString + TREES: LiteralString + + class GitHubTag(TypedDict): name: str node_id: str @@ -65,6 +96,14 @@ class ParsedTag(TypedDict): trees_url: str +class ReParsedTag(ParsedTag): + major: int + minor: int + patch: int + pre_release: int | None + is_pre_release: bool + + class GitHubTree(TypedDict): """ A single file's metadata within the response of `Get a tree`_. @@ -97,24 +136,6 @@ class GitHubTreesResponse(TypedDict): truncated: bool -class GitHubBlobResponse(TypedDict): - """ - Response from `Get a blob`_. - - Obtained by following ``GitHubTree["url"]``. - - .. _Get a blob: - https://docs.github.com/en/rest/git/blobs?apiVersion=2022-11-28#get-a-blob - """ - - content: str - sha: str - node_id: str - size: int | None - encoding: str - url: str - - class ParsedTree(TypedDict): file_name: str name_js: str @@ -123,6 +144,11 @@ class ParsedTree(TypedDict): size: int url: str ext_supported: bool + tag: str + + +class QueryTree(ParsedTree, total=False): + name_js: Required[str] class ParsedTreesResponse(TypedDict): @@ -131,64 +157,442 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] -def _request_github(url: str, /, *, raw: bool = False) -> Request: +class GitHubRateLimit(TypedDict): + limit: int + used: int + remaining: int + reset: int + + +class ParsedRateLimit(GitHubRateLimit): + reset_time: time.struct_time + is_limited: bool + is_auth: bool + + +class GitHubRateLimitResources(TypedDict, total=False): + """ + A subset of response from `Get rate limit status for the authenticated user`_. + + .. _Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ + + core: Required[GitHubRateLimit] + search: Required[GitHubRateLimit] + graphql: GitHubRateLimit + integration_manifest: GitHubRateLimit + code_search: GitHubRateLimit + + +class _ErrorHandler(urllib.request.BaseHandler): + """ + Adds `rate limit`_ info to a forbidden error. + + .. _rate limit: + https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 + """ + + def http_error_default( + self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message + ): + if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): + limit = hdrs.get("X-RateLimit-Limit", "") + remaining = hdrs.get("X-RateLimit-Remaining", "") + msg = ( + f"{msg}\n\nFailed to balance rate limit.\n" + f"{limit=}, {remaining=}\n" + f"Reset: {time.localtime(int(reset))!r}" + ) + raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) + + +class _GitHubRequestNamespace: + """ + Fetching resources from the `GitHub API`_. + + .. _GitHub API: + https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 """ - Wrap a request url with a `personal access token`_ - if set as an env var. - By default the endpoint returns json, specify raw to get blob data. - See `Media types`_. 
+ _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" + _TAGS_MAX_PAGE: Literal[100] = 100 + _VERSION: LiteralString = "2022-11-28" + _UNAUTH_RATE_LIMIT: Literal[60] = 60 + _TAGS_COST: Literal[1] = 1 + _TREES_COST: Literal[2] = 2 + _UNAUTH_DELAY: Literal[5] = 5 + _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_TREES_LIMIT: Literal[10] = 10 + + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self) -> GitHubRateLimitResources: + """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" + with self._gh._opener.open(self._request(self.url.RATE)) as response: + content: GitHubRateLimitResources = json.load(response)["resources"] + return content + + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > self._TAGS_MAX_PAGE: + raise ValueError(n) + req = self._request(f"{self.url.TAGS}?per_page={n}") + with self._gh._opener.open(req) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform **2x requests** to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" + else: + url = tag["trees_url"] + with self._gh._opener.open(self._request(url)) as response: + content: GitHubTreesResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with self._gh._opener.open(self._request(data_url)) as response: + data_dir: GitHubTreesResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + def _request(self, url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. - .. _personal access token: + .. _personal access token: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - .. _Media types: + .. _Media types: https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ + headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} + if tok := os.environ.get(self._ENV_VAR): + headers["Authorization"] = ( + tok if tok.startswith("Bearer ") else f"Bearer {tok}" + ) + if raw: + headers["Accept"] = "application/vnd.github.raw+json" + return urllib.request.Request(url, headers=headers) + + +class _GitHubParseNamespace: + """ + Transform responses into intermediate representations. 
+ + Where relevant: + - Adding cheap to compute metadata + - Dropping information that we don't need for the task """ - headers = {} - if tok := os.environ.get("VEGA_GITHUB_TOKEN"): - headers["Authorization"] = tok - if raw: - headers["Accept"] = "application/vnd.github.raw+json" - return Request(url, headers=headers) + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh -def _request_trees(tag: str | Any, /) -> GitHubTreesResponse: + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: + core = rate_limit["core"] + reset = core["reset"] + return ParsedRateLimit( + **core, + reset_time=time.localtime(reset), + is_limited=core["remaining"] == 0, + is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, + ) + + def tag(self, tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") + + def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: + return [self.tag(t) for t in tags] + + def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=_js_to_py(path.stem), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + tag=tag, + ) + + def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return [self.tree(t, tag) for t in tree["tree"]] + + +class _GitHubQueryNamespace: + """**WIP** Interfacing with the cached metadata.""" + + def __init__(self, gh: _GitHub, /) -> None: + self._gh = gh + + @property + def paths(self) -> dict[_PathName, Path]: + return self._gh._paths + + def url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + fp = self.paths["trees"] + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + +class _GitHub: """ - For a given ``tag``, perform 2x requests to get directory metadata. + Primary interface with the GitHub API. + + Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. + + - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. + - Organizes distinct groups of operations into property accessor namespaces. + + + .. _tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags + .. _trees: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + .. _rate_limit: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - Returns response unchanged - but with annotations. 
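    Editor's addition (hedged sketch, not author text): typical use of the
    module-level ``GitHub`` instance defined below, assuming the cached parquet
    files exist under ``tools/_vega_datasets_data/`` and network access is available.

    >>> GitHub.rate_limit()["remaining"] > 0          # doctest: +SKIP
    True
    >>> metadata = GitHub.refresh()                   # doctest: +SKIP
    >>> metadata.columns                              # doctest: +SKIP
    ['ext_supported', 'file_name', 'name_collision', ...]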
""" - if _is_str(tag): - url = tag if tag.startswith(_GITHUB_TREES_URL) else f"{_GITHUB_TREES_URL}{tag}" - else: - url = tag["trees_url"] - with urlopen(_request_github(url)) as response: - content: GitHubTreesResponse = json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) - if data_url := next(query, None): - with urlopen(_request_github(data_url)) as response: - data_dir: GitHubTreesResponse = json.load(response) - return data_dir - else: - raise FileNotFoundError + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) + + def __init__( + self, + output_dir: Path, + name_tags: str, + name_trees: str, + *, + write_schema: bool, + base_url: LiteralString = "https://api.github.com/", + ) -> None: + # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced + # that describes column types - in a non-binary format. + self._write_schema: bool = write_schema + output_dir.mkdir(exist_ok=True) + self._paths: dict[_PathName, Path] = { + "dir": output_dir, + "tags": output_dir / f"{name_tags}.parquet", + "trees": output_dir / f"{name_trees}.parquet", + } + repo = f"{base_url}repos/vega/vega-datasets/" + self._url = GitHubUrl( + BASE=base_url, + RATE=f"{base_url}rate_limit", + REPO=repo, + TAGS=f"{repo}tags", + TREES=f"{repo}git/trees/", + ) -def _request_tags(n: int = 30, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" - if n < 1 or n > _TAGS_MAX_PAGE: - raise ValueError(n) - with urlopen(_request_github(f"{_GITHUB_TAGS_URL}?per_page={n}")) as response: - content: list[GitHubTag] = json.load(response) - if warn_lower and len(content) < n: - earliest = response[-1]["name"] - n_response = len(content) - msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" - warnings.warn(msg, stacklevel=3) - return content + @property + def req(self) -> _GitHubRequestNamespace: + return _GitHubRequestNamespace(self) + @property + def parse(self) -> _GitHubParseNamespace: + return _GitHubParseNamespace(self) -def _parse_tag(tag: GitHubTag, /) -> ParsedTag: - sha = tag["commit"]["sha"] - return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{_GITHUB_TREES_URL}{sha}") + @property + def query(self) -> _GitHubQueryNamespace: + return _GitHubQueryNamespace(self) + + @property + def url(self) -> GitHubUrl: + return self._url + + def rate_limit(self) -> ParsedRateLimit: + return self.parse.rate_limit(self.req.rate_limit()) + + def tags(self, n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) + + def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: + """Retrieve directory info for a given version ``tag``.""" + trees = self.req.trees(tag) + tag_v = _tag_from(tag) if _is_str(tag) else tag["tag"] + parsed = self.parse.trees(trees, tag=tag_v) + df = ( + pl.DataFrame(parsed) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + def refresh( + self, fp_tags: Path | None = None, fp_trees: Path | None = None + ) -> pl.DataFrame: + """ + Use known tags to discover and update missing trees metadata. 
+ + Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. + """ + rate_limit = self.rate_limit() + if rate_limit["is_limited"]: + raise NotImplementedError(rate_limit) + fp_tags = fp_tags or self._paths["tags"] + fp_trees = fp_trees or self._paths["trees"] + IS_AUTH = rate_limit["is_auth"] + UNAUTH_LIMIT = self.req._UNAUTH_TREES_LIMIT + + tags = ( + self._refresh_tags(fp_tags) + if IS_AUTH or rate_limit["remaining"] > UNAUTH_LIMIT + else pl.read_parquet(fp_tags) + ) + trees = pl.read_parquet(fp_trees) + + missing_trees = tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" + ) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp_trees!s}") + return trees + else: + missing = ( + ReParsedTag(**row) + for row in islice( + missing_trees.iter_rows(named=True), + None if IS_AUTH else UNAUTH_LIMIT, + ) + ) + fresh_rows = self._trees_batched(missing) + print( + f"Finished collection.\n" + f"Writing {fresh_rows.height} new rows to {fp_trees!s}" + ) + refreshed = pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + _write_parquet(refreshed, fp_trees, write_schema=self._write_schema) + return refreshed + + def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: + rate_limit = self.rate_limit() + if rate_limit["is_limited"]: + raise NotImplementedError(rate_limit) + elif not isinstance(tags, Sequence): + tags = tuple(tags) + req = self.req + n = len(tags) + cost = req._TREES_COST * n + if rate_limit["remaining"] < cost: + raise NotImplementedError(rate_limit, cost) + delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY + print( + f"Collecting metadata for {n} missing releases.\n" + f"Using {delay_secs=} between requests ..." + ) + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay_secs + random.triangular()) + dfs.append(self.trees(tag)) + return pl.concat(dfs) + + def _refresh_tags( + self, fp: Path | None = None, *, limit_new: int | None = None + ) -> pl.DataFrame: + n_new_tags: int = 0 + fp = fp or self._paths["tags"] + if not fp.exists(): + print(f"Initializing {fp!s}") + tags = self.tags(limit_new) + n_new_tags = tags.height + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + curr_latest = self.tags(1) + if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + else: + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(limit_new), prev_eager), how="vertical") + .unique("sha") + .pipe(_sort_sem_ver) + ) + n_new_tags = tags.height - prev_eager.height + print(f"Collected {n_new_tags} new tags") + _write_parquet(tags, fp, write_schema=self._write_schema) + return tags + + +GitHub = _GitHub( + Path(__file__).parent / "_vega_datasets_data", + name_trees="metadata_full", + name_tags="tags", + write_schema=True, +) + +####################################################################################### + + +def _tag_from(s: str, /) -> str: + # - Actual tag + # - Trees url (using ref name) + # - npm url (works w/o the `v` prefix) + trees_url = GitHub.url.TREES + if s.startswith("v"): + return s + elif s.startswith(trees_url): + return s.replace(trees_url, "") + elif s.startswith(_NPM_BASE_URL): + s, _ = s.replace(_NPM_BASE_URL, "").split("/") + return s if s.startswith("v") else f"v{s}" + else: + raise TypeError(s) def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: @@ -216,64 +620,9 @@ def _with_sem_ver(df: 
pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: ) -def request_tags_to_df(n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: - response = _request_tags(n=n_head or _TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame([_parse_tag(tag) for tag in response]).pipe(_with_sem_ver) - - -def _parse_tree(tree: GitHubTree, /) -> ParsedTree: - """For a single tree (file) convert to an IR with only relevant properties.""" - path = Path(tree["path"]) - return ParsedTree( - file_name=path.name, - name_js=path.stem, - name_py=_js_to_py(path.stem), - suffix=path.suffix, - size=tree["size"], - url=tree["url"], - ext_supported=is_ext_supported(path.suffix), - ) - - -def _parse_trees_response( - tree: GitHubTreesResponse, /, tag: str -) -> ParsedTreesResponse: - """For a tree response (directory of files) convert to an IR with only relevant properties.""" - return ParsedTreesResponse( - tag=tag, url=tree["url"], tree=[_parse_tree(t) for t in tree["tree"]] - ) - - -def request_trees_to_df(tag: str, /) -> pl.DataFrame: - response = _request_trees(tag) - parsed = _parse_trees_response(response, tag=tag) - df = ( - pl.DataFrame(parsed["tree"]) - .lazy() - .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated(), tag=pl.lit(tag)) - .with_columns( - url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), - pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), - pl.col("file_name"), - ) - ) - .collect() - ) - return df.select(*sorted(df.columns)) - - -def request_trees_to_df_batched(*tags: str, delay: int = 5) -> pl.DataFrame: - import random - import time - - dfs: list[pl.DataFrame] = [] - for tag in tags: - time.sleep(delay + random.triangular()) - dfs.append(request_trees_to_df(tag)) - return pl.concat(dfs) +def _sort_sem_ver(frame: _Frame, /) -> _Frame: + """Sort ``frame``, displaying in descending release order.""" + return frame.sort(_SEM_VER_FIELDS, descending=True) def _write_parquet( @@ -298,71 +647,6 @@ def _write_parquet( json.dump(schema, f, indent=2) -def collect_metadata(tag: str, /, fp: Path, *, write_schema: bool = True) -> None: - """ - Retrieve directory info for a given version ``tag``, writing to ``fp``. - - When ``write_schema``, an addtional ``...-schema.json`` file is produced - that describes the metadata columns. - """ - metadata = request_trees_to_df(tag) - _write_parquet(metadata, fp, write_schema=write_schema) - - -def collect_tags( - n_head: int | None, fp: Path, *, warn_lower: bool = False, write_schema: bool = True -): - tags = request_tags_to_df(n_head, warn_lower=warn_lower) - _write_parquet(tags, fp, write_schema=write_schema) - - -def refresh_tags(fp: Path, *, limit_new: int = 10) -> pl.DataFrame: - if fp.exists(): - print("Checking for new tags") - prev = pl.read_parquet(fp) - prev_latest = prev.sort(_SEM_VER_FIELDS, descending=True).head(1) - curr_latest = request_tags_to_df(1) - if curr_latest.equals(prev_latest): - print(f"Already up-to-date {fp!s}") - return prev - else: - # Work out how far behind? 
- print(f"Refreshing {fp!s}") - fresh = ( - pl.concat((request_tags_to_df(limit_new), prev), how="vertical") - .unique("sha") - .sort(_SEM_VER_FIELDS, descending=True) - ) - _write_parquet(fresh, fp, write_schema=True) - print(f"Collected {fresh.height - prev.height} new tags") - return fresh - else: - print(f"Initializing {fp!s}") - collect_tags(None, fp) - return pl.read_parquet(fp) - - -def url_from( - fp: Path, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, -) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago From 31eeb2042a6cfae6c2ca95874797b61e339e41d8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:26:47 +0100 Subject: [PATCH 018/137] feat(DRAFT): Partial implement `data("name")` --- tools/vendor_datasets.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 6bb0d0216..d02ef5130 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -824,5 +824,38 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() + # BUG: # 1.6.0 exists on GH but not npm? + def __call__( + self, + name: str, + ext: ExtSupported | None = None, + /, + tag: LiteralString | Literal["latest"] | None = None, + ): + """ + **WIP** Will be using this *instead of* attribute access. + + - Original supports this as well + - Will only be using the actual (js_name) + - Some have hyphens, others underscores + """ + constraints: dict[Literal["tag", "suffix"], str] = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + if name.endswith(get_args(ExtSupported)): + name, suffix = name.rsplit(".", maxsplit=1) + suffix = "." 
+ suffix + else: + suffix = ext + if suffix is not None: + if not is_ext_supported(suffix): + raise TypeError(suffix) + else: + constraints["suffix"] = suffix + q = QueryTree(name_js=name, **constraints) + return GitHub.query.url_from(**q) + data = DataLoader() From 511a8455f9caa285a7220bf989f6d607a704f070 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 11 Oct 2024 17:28:01 +0100 Subject: [PATCH 019/137] fix(typing): Resolve some `mypy` errors --- tools/vendor_datasets.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index d02ef5130..2c0f47a90 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -10,6 +10,7 @@ import json import os import random +import sys import tempfile import time import urllib.request @@ -24,18 +25,23 @@ Callable, ClassVar, Iterable, + Iterator, Literal, NamedTuple, Sequence, - TypedDict, + cast, get_args, ) from urllib.request import urlopen import polars as pl +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if TYPE_CHECKING: - import sys from email.message import Message from typing import MutableMapping, TypeVar from urllib.request import OpenerDirector, Request @@ -147,8 +153,15 @@ class ParsedTree(TypedDict): tag: str -class QueryTree(ParsedTree, total=False): +class QueryTree(TypedDict, total=False): + file_name: str name_js: Required[str] + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str class ParsedTreesResponse(TypedDict): @@ -501,13 +514,10 @@ def refresh( print(f"Already up-to-date {fp_trees!s}") return trees else: - missing = ( - ReParsedTag(**row) - for row in islice( - missing_trees.iter_rows(named=True), - None if IS_AUTH else UNAUTH_LIMIT, - ) + it = islice( + missing_trees.iter_rows(named=True), None if IS_AUTH else UNAUTH_LIMIT ) + missing = cast("Iterator[ReParsedTag]", it) fresh_rows = self._trees_batched(missing) print( f"Finished collection.\n" @@ -847,14 +857,16 @@ def __call__( if name.endswith(get_args(ExtSupported)): name, suffix = name.rsplit(".", maxsplit=1) suffix = "." 
+ suffix - else: - suffix = ext - if suffix is not None: if not is_ext_supported(suffix): raise TypeError(suffix) else: constraints["suffix"] = suffix - q = QueryTree(name_js=name, **constraints) + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] return GitHub.query.url_from(**q) From a770ba9247300809cc18c6a6863cb38c0c7819f5 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:27:35 +0100 Subject: [PATCH 020/137] fix(ruff): Apply `3.8` fixes https://github.com/vega/altair/actions/runs/11495437283/job/31994955413 --- tools/vendor_datasets.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 2c0f47a90..dc31cc61e 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -15,6 +15,7 @@ import time import urllib.request import warnings +from collections.abc import Iterable, Iterator, Sequence from functools import cached_property, partial from itertools import islice from pathlib import Path @@ -24,11 +25,8 @@ Any, Callable, ClassVar, - Iterable, - Iterator, Literal, NamedTuple, - Sequence, cast, get_args, ) @@ -42,8 +40,9 @@ from typing_extensions import TypedDict if TYPE_CHECKING: + from collections.abc import MutableMapping from email.message import Message - from typing import MutableMapping, TypeVar + from typing import TypeVar from urllib.request import OpenerDirector, Request if sys.version_info >= (3, 13): From 686a48599f86cffb49549d72e697c88aa4440d45 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:31:28 +0100 Subject: [PATCH 021/137] docs(typing): Add `WorkInProgress` marker to `data(...)` - Still undecided exactly how this functionality should work - Need to resolve `npm` tags != `gh` tags issue as well --- tools/vendor_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index dc31cc61e..ad8debbc5 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -61,6 +61,7 @@ _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) _PathName: TypeAlias = Literal["dir", "tags", "trees"] + WorkInProgress: TypeAlias = Any _ItemSlice: TypeAlias = ( @@ -840,7 +841,7 @@ def __call__( ext: ExtSupported | None = None, /, tag: LiteralString | Literal["latest"] | None = None, - ): + ) -> WorkInProgress: """ **WIP** Will be using this *instead of* attribute access. 
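A minimal usage sketch for the WIP `data(...)` entry point introduced in the patches above, assuming `tools/vendor_datasets.py` is importable as a module and the trees metadata parquet has already been collected; the dataset names and the pinned tag are illustrative only, and at this stage the call is expected to resolve to an `npm` URL string rather than a DataFrame:

    from tools.vendor_datasets import data

    url = data("cars")                          # name only, suffix left unconstrained
    url = data("co2-concentration.csv")         # extension folded into the "suffix" constraint
    url = data("cars", ".json", tag="v2.9.0")   # explicit extension and a pinned tag
    # tag="latest" is not implemented yet and raises NotImplementedError

Names are the upstream `name_js` values, so hyphenated names such as "co2-concentration" are passed through as-is rather than converted to underscores.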
From 0bbf2e9ec2ff2f1d79b4d4f68128625daab2d947 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 5 Nov 2024 19:42:18 +0000 Subject: [PATCH 022/137] feat(DRAFT): Add a source for available `npm` versions --- .../_vega_datasets_data/tags_npm-schema.json | 9 +++ tools/_vega_datasets_data/tags_npm.parquet | Bin 0 -> 3114 bytes tools/vendor_datasets.py | 56 +++++++++++++++++- 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 tools/_vega_datasets_data/tags_npm-schema.json create mode 100644 tools/_vega_datasets_data/tags_npm.parquet diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/_vega_datasets_data/tags_npm-schema.json new file mode 100644 index 000000000..8de9881a0 --- /dev/null +++ b/tools/_vega_datasets_data/tags_npm-schema.json @@ -0,0 +1,9 @@ +{ + "tag": "str", + "major": "int", + "minor": "int", + "patch": "int", + "pre_release": "int", + "is_pre_release": "bool", + "v_tag": "str" +} \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/_vega_datasets_data/tags_npm.parquet new file mode 100644 index 0000000000000000000000000000000000000000..38be9c271c7638490835298d6fff114c9e921a7c GIT binary patch literal 3114 zcmcIndrVVT96q-%Dim#X=XkHF6Hz;u2nCUqh3PI2d03=WMIJM}+O?1(q!28~W<*>z zgU?i>lWc@Jooqvw>9UZ>7*j^ns4+U3n^QM3G`<-7V;ZO3@7%Un1QyKZO?%Jp+}}Ch z?>pZ;=WA-8PN}`NwMf6SB`dMBtx24H;>-SBa}0yOvR&I#4rYah zZpfNk+*0s>T#UMH`63h}T!HJEFd(SJGAvWapb%+Pcx? z!7V^CCC7@Z3N2Vc9OUN!vQ46q$Bj}BL<_>R_0V`rE7K7Y@yz;}uE!Dtj0p}IMp#aH z(TkNc?ttE@+Cc$1FcmruhKXVpBP{U{k(L#fmfHm?Ee!dZ(F{r`mr;>`$$%J}^mySA zqxRvQ4*Ct8MU|?K;?ZX+* z@bF~io`5z2W9@pc&QjG4^|G|kkDbOIHkjr+AO5noKhIKk@7Q0L(>%3q?moY^-)C7R zt3Au=kdosfjnhw9)@bEmSp_&sj}=3!(ryH?#NvkdV5RH+ck9+Y`f~=ATZhwzlb2F5 zaE&CdzV4M|j_ z#37=(oe0%$5O0j1sd7CywdnD{)qEF2scNv!hpH(=E^{*-h3b?&2>C&F!t3}xqeEgT zx@PxBPZ$PN*v`39ws^0U8 z<5L$S4$C-pVo7EG(*?6hsitodmBj-CLj3KIKO1Hz9gJCmin>+G<~vROeXgPfcb4q2 zt{{(hEZty~9XJFYl6vKvwnKS(QfaWid~9{{n6EM%Lt1BF^h- z9P4&Dke?tVWQM$hV~1?OH|^O;2-4o;G)CIHiN{EL_asf5+_f1%Wf-6FUZsGrTRTrq zO-_sOX$CkWPE8I=ajhqG%JFgU<1GQW$s6|t;!Q*GBwXqlJ}?EJ6CPs`*O(%)mzP)J zi3ls^hy;(GgY;@BfhEv)TctOq{^E&+BkYcS4-1?_nDx|#c0;>gl?R+SI5k(N*)35& z)6WBGS0fP3}sEEhl@a4ft0=(TEIM#7!oLhI}S08%QQuV4<|lq&f(zrwt!V*QXUi$O==_ zC>4Ov(|&PEnk{>Nnyn}~%UT?@!p*-{kV3do;($sk{tN%*P$X6u38Ub$@Clhzc#B7$ zc&jOSX*taZ`uyMc$&nRBWwABS5#L(qi~k??i?D@KjMm&W{A-An8Om*i_DZe}ymaIT Mt%ctS1N=+)2T| TypeIs[str]: @@ -142,6 +144,30 @@ class GitHubTreesResponse(TypedDict): truncated: bool +class NpmVersion(TypedDict): + version: str + links: dict[Literal["self", "entrypoints", "stats"], str] + + +class NpmPackageMetadataResponse(TypedDict): + """ + Response from `Get package metadata`_. + + Using: + + headers={"Accept": "application/json"} + + .. _Get package metadata: + https://data.jsdelivr.com/v1/packages/npm/vega-datasets + """ + + type: str + name: str + tags: dict[Literal["canary", "next", "latest"], str] + versions: list[NpmVersion] + links: dict[Literal["stats"], str] + + class ParsedTree(TypedDict): file_name: str name_js: str @@ -589,6 +615,31 @@ def _refresh_tags( ####################################################################################### +def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: + """ + Request, parse npm tags metadata. 
+ + Notes + ----- + - Ignores canary releases + - Github tag is stored as `"v_tag"` + - npm tag is `"tag"` + """ + req = urllib.request.Request( + _NPM_METADATA_URL, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + v["version"] for v in content["versions"] if _CANARY not in v["version"] + ] + return ( + pl.DataFrame({"tag": versions}) + .pipe(_with_sem_ver) + .with_columns(v_tag=pl.concat_str(pl.lit("v"), pl.col("tag"))) + ) + + def _tag_from(s: str, /) -> str: # - Actual tag # - Trees url (using ref name) @@ -614,10 +665,10 @@ def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: """ fields = pl.col(_SEM_VER_FIELDS) pattern = r"""(?x) - v(?[[:digit:]]*)\. + v?(?[[:digit:]]*)\. (?[[:digit:]]*)\. (?[[:digit:]]*) - (\-next\.)? + (\-(next)?(beta)?\.)? (?[[:digit:]]*)? """ sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) @@ -835,6 +886,7 @@ def __dir__(self) -> list[str]: return self.list_datasets() # BUG: # 1.6.0 exists on GH but not npm? + # https://www.jsdelivr.com/docs/data.jsdelivr.com#overview def __call__( self, name: str, From 9c386e26515b23b0bccbe5505ed9c9bbcb05b96c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:41:23 +0000 Subject: [PATCH 023/137] refactor: Bake `"v"` prefix into `tags_npm` --- .../_vega_datasets_data/tags_npm-schema.json | 3 +-- tools/_vega_datasets_data/tags_npm.parquet | Bin 3114 -> 2596 bytes tools/vendor_datasets.py | 16 ++++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/_vega_datasets_data/tags_npm-schema.json index 8de9881a0..90ea9d52e 100644 --- a/tools/_vega_datasets_data/tags_npm-schema.json +++ b/tools/_vega_datasets_data/tags_npm-schema.json @@ -4,6 +4,5 @@ "minor": "int", "patch": "int", "pre_release": "int", - "is_pre_release": "bool", - "v_tag": "str" + "is_pre_release": "bool" } \ No newline at end of file diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/_vega_datasets_data/tags_npm.parquet index 38be9c271c7638490835298d6fff114c9e921a7c..d2e9a34b78eef3da66b7b70e82ed4a6dcf0a5502 100644 GIT binary patch delta 834 zcmaJ=O=uHA6rN2s+ikKT*6a+sWoxmAP!RK1tyP-f#G0f<+a{?^Q@kV_n^HFho7#pb z>dAwkmjMwefL^$SLDORvT8sv*Yo{0f@bQldvtI^=uU|8!~Wf9@7UIt804b^?+64Tq#I=0g_{S* zmW_a;-O#N-OEj>epGMLVUHgAK zkzgbo3?oDys8}OgJ$M8K+LD3A1GlSMT&gYG<7f$_Ed=b=bsu@IVnTlO;5Z6=>o%C> z= zT|XW`+pJ6O{z(>yF0?l+p{7W+Ksd~nbld(0uS~1 ze6l&8VIgljoZkettsV`Hv#!)gV_u;81HCs404xK d$s3f$Vn~ywYgTc&u3H8|e|_kah!FfZzW{02$58+P delta 1411 zcmbtUZAep57(REqyAyLZt8=<{-9QV6LfNKPla_IuwiT9}MAix=x}-Od)|-dc|W+fdr?#j zIdQ2O^6*$9yrwFKK?nfIxVLJuvW?rqgVN(EDYw2h+zZ-ntk_%nJU0>^i3v{{^}^N5 zeUIhv*oL8!_iZ}|@RM-KuDdsb-v(C-o!%F%(cVd4-&2Fs=jk~8?1G5{Y1STn|J%Dl z7c?$SEv#&R)ZJy#n?9YcF1B@EaToUYM%13icfOoYcTRj3V`m-L-I`~wCObB2DY~xnY3iVKcg1iP~XtJ~NvP`0I_G!h!ck|F0-k4zh%}gv@t9pXBk^m=? 
z*W3ejSZgI|Dv|W4I@Ga5EBQcmm--YJCLh)Hx{>)Tnd3x`j^H+rIE$uf_rWS3OhQ34 zoP;j%#%n1oL};4g)i#;4%%&_808)xd5{0RoBx=Y${vz4Q%e!PWpn?M&qJWZ&ZfRi4 zv>Hx;kK*Qzc`y>!M|AMyPklbDn+_je2wy2?sFKoXlBFZY3%m@Iq}4#sOQJg7#UF+gZvjOmD57ozbAd4`$RxrU;|8#8e*>b|!F985J&EunE&d`YysY#jeC* z>};V)7Mf3849Zcyr^ddqo`ooQWwQYp?wkM%Ss8YJizla`x!j%CT)sW%-{INn$C#W+ zaD|E)iPy2&@5$NhueQ5uz1d|`q%BJQieO~430Cu8IJeOz@{~v#MUM~LTx=-5>81*5 zYwTMC%%e1=;D=^R>Hi4q=7#G1dCfBk{(&E2(Kx<_e{^6s*H-;bFncR=Ub8~!D6L&t ceh9AzqTmeps~QhTN9Z>>`vHeF0I)yr4{sECDF6Tf diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 342575da9..abc90629f 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -622,8 +622,10 @@ def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: Notes ----- - Ignores canary releases - - Github tag is stored as `"v_tag"` - - npm tag is `"tag"` + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} """ req = urllib.request.Request( _NPM_METADATA_URL, headers={"Accept": "application/json"} @@ -631,13 +633,11 @@ def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: with urllib.request.urlopen(req) as response: content: NpmPackageMetadataResponse = json.load(response) versions = [ - v["version"] for v in content["versions"] if _CANARY not in v["version"] + f"v{version}" + for v in content["versions"] + if (version := v["version"]) and _CANARY not in version ] - return ( - pl.DataFrame({"tag": versions}) - .pipe(_with_sem_ver) - .with_columns(v_tag=pl.concat_str(pl.lit("v"), pl.col("tag"))) - ) + return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) def _tag_from(s: str, /) -> str: From 1937f2b74df00d2a649ec52879e90ef2fe469cbc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:31:53 +0000 Subject: [PATCH 024/137] refactor: Move `_npm_metadata` into a class --- tools/vendor_datasets.py | 87 +++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index abc90629f..ed094d0c0 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -90,6 +90,11 @@ class GitHubUrl(NamedTuple): TREES: LiteralString +class NpmUrl(NamedTuple): + CDN: LiteralString + TAGS: LiteralString + + class GitHubTag(TypedDict): name: str node_id: str @@ -446,6 +451,8 @@ def __init__( *, write_schema: bool, base_url: LiteralString = "https://api.github.com/", + org: LiteralString = "vega", + package: LiteralString = "vega-datasets", ) -> None: # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced # that describes column types - in a non-binary format. @@ -456,7 +463,7 @@ def __init__( "tags": output_dir / f"{name_tags}.parquet", "trees": output_dir / f"{name_trees}.parquet", } - repo = f"{base_url}repos/vega/vega-datasets/" + repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( BASE=base_url, RATE=f"{base_url}rate_limit", @@ -605,8 +612,10 @@ def _refresh_tags( return tags +_root_dir: Path = Path(__file__).parent + GitHub = _GitHub( - Path(__file__).parent / "_vega_datasets_data", + _root_dir / "_vega_datasets_data", name_trees="metadata_full", name_tags="tags", write_schema=True, @@ -615,29 +624,61 @@ def _refresh_tags( ####################################################################################### -def _npm_metadata(*args: WorkInProgress) -> pl.DataFrame: - """ - Request, parse npm tags metadata. 
+class _Npm: + def __init__( + self, + output_dir: Path, + name_tags: str, + *, + write_schema: bool, + jsdelivr: Literal["jsdelivr"] = "jsdelivr", + npm: Literal["npm"] = "npm", + package: LiteralString = "vega-datasets", + jsdelivr_version: LiteralString = "v1", + ) -> None: + self._write_schema: bool = write_schema + output_dir.mkdir(exist_ok=True) + self._paths: dict[Literal["tags"], Path] = { + "tags": output_dir / f"{name_tags}.parquet" + } + self._url: NpmUrl = NpmUrl( + CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", + TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + ) - Notes - ----- - - Ignores canary releases - - ``npm`` can accept either, but this endpoint returns without "v": + @property + def url(self) -> NpmUrl: + return self._url - {tag} - v{tag} - """ - req = urllib.request.Request( - _NPM_METADATA_URL, headers={"Accept": "application/json"} - ) - with urllib.request.urlopen(req) as response: - content: NpmPackageMetadataResponse = json.load(response) - versions = [ - f"v{version}" - for v in content["versions"] - if (version := v["version"]) and _CANARY not in version - ] - return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + def tags(self) -> pl.DataFrame: + """ + Request, parse tags from `Get package metadata`_. + + Notes + ----- + - Ignores canary releases + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} + + .. _Get package metadata: + https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- + """ + req = urllib.request.Request( + self.url.TAGS, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + f"v{tag}" + for v in content["versions"] + if (tag := v["version"]) and _CANARY not in tag + ] + return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + + +Npm = _Npm(_root_dir / "_vega_datasets_data", name_tags="tags_npm", write_schema=True) def _tag_from(s: str, /) -> str: From 66fa6d15cd967a25752e35814a6c3f03ea771487 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 11:35:11 +0000 Subject: [PATCH 025/137] chore: Remove unused, add todo --- tools/vendor_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index ed094d0c0..048ff8771 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -70,7 +70,6 @@ """Query result scalar selection.""" _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_NPM_METADATA_URL = "https://data.jsdelivr.com/v1/packages/npm/vega-datasets" _SUB_DIR = "data" _SEM_VER_FIELDS: tuple[ Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] @@ -595,6 +594,7 @@ def _refresh_tags( print("Checking for new tags") prev = pl.scan_parquet(fp) curr_latest = self.tags(1) + # TODO: Needs a hook for `_npm_metadata()` if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() From 21b2edd0ee1c55ab09e8a31535a3a15f5ab55720 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:19:24 +0000 Subject: [PATCH 026/137] feat: Adds `app` context for github<->npm --- tools/_vega_datasets_data/tags.parquet | Bin 6210 -> 6200 bytes tools/vendor_datasets.py | 222 ++++++++++++++----------- 2 files changed, 127 insertions(+), 95 deletions(-) 
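For orientation, a rough sketch of how the `app` context added by the diff below is intended to drive a refresh, assuming the module is importable and the GitHub rate limit is not exhausted (the refresh raises NotImplementedError when it is); the printed column names are only indicative of the trees metadata schema:

    from tools.vendor_datasets import app

    # npm tags -> github tags -> github trees, each written to its parquet file
    gh_trees = app.refresh()
    print(gh_trees.columns)  # e.g. ["ext_supported", "file_name", ..., "url_npm"]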
diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/_vega_datasets_data/tags.parquet index dc0ff652ed261eebeed70ead42c0f7352ea4e8c3..1cd7b957b3ce87cfa8e80b05548f72c9133b5d1c 100644 GIT binary patch delta 3091 zcmc&%do)!07eB`ghGxbL&M`B_GmY06GhRLDW+?*0Aj+?jG;+I< zaMJ^IM-wU1NECW_Cz0~#Mfg#-x!ve@_|?rH-QW5>{`j4>&RTnY*Jq#a-fQpu*`ITh zS-})}u-CI7G*qMvje{JB3jqLNQf4{D>j0urfcgqNK{6cWMe2|4b!)WI9v%-uU1Xb9 zdoJz3-T!^(ON$pu#f$)J7CT?_L$8+4gpTf|)?ZFtjy6(+0LTa0N=Kp~WDxa4`gA@B zC?ElGK_#4{K6W{u>RLeO3^;fgxp9TQK4(HVz1YJuIUEq28&ouN4q?f*fD(aBLPhbn zKvqAW4lq&LVZ~U?V3L$~uox^Rg9!jg+T&EH6F)cYMM^jwz$_>ehnI*7 zQAQ$CT1itxEnl^(1Ly$SfW)}9&q5uBR|+%^*rB(dp7#Oje02Er{hr4w>KD=JaD#3^ zorj07%`3CetC1-sAujmijXf#%clV!P9{*vxX$iaiVE01%$H5T|thiVjxyx**aW6I7 zFgY-9C7x2*sW#Y1rE#~Uw%^IKy{RfJeZ&jtm zJ+G79#?I>fa6Tlrko<0wE8Y9j#Us{H0FboaHjyJeyECWNCsppXG_*2%$ZLmmiHN(Y zC@Q5cbEH}2#e@EdfS+x&Jnv*l7TXxc-yR12lQIj}*)MzFojck&wEz3nr|@j|Reyx$ z7*!c--uxrb;Qi9Rv4Z@v4H{Tu-Y;2^gaZ_-xQ2Y;8zvNcA?PyE`PC|d?<1$3!F_R# zS(6xTJ)HY#G5_%Mj@H6$=X5DMtC~_A|3AV5 zYUo=3LvzA4lJ9|M-0vHmo)UC_ZnO^ER#T@L80hSN%VDbcu7Y?BlW8=axvJ9bP3hE< zt#FI(uYSz_^R`Rb1LnlDqILZ|<4WO*oboVnuQEksx-$ON79A@)^y}KH@u$`gxFdvZ zORSGXgbyCOJ(~Zd!p!jY1N5Qa)aD<&A zjFpvSm)A70du0`Inl;O`FFFpM$v+R59eb#yWy%UzXV=d>95yZb`_-ZNV9NED(&09T@4dm)H5DXUd}82G{rCeKX2ay!G;Lscad`me=!R z5+VBv@jVP_t+-|Dx{=Wg|DK>^(Bd3%gkY0Qb_qJfx}zr1Y6bN_ssZSo8``KJ2h_|J z3>63pdq92t1fupR#>8A1NF@S*e_m0GACaN@@}Jd|MmT83^+Sf0_8N<7+Xv`XE|aLn#4&;xV>_jhn(otTR+ikwBu(_ z+O*-kX#U!cwsy#=DAOH(13TTR+o$*<#iv77A;=eFYOz1s%y1yl78>p3n#(VqtPij`D(nU)p<3d1Q1JEgbK~mahxaL3u4tx z#uZTnBOW*`bKHwkL{mNTP)Y<|ULG5TLb$H0D7U=#-h^4@_c~?_uWL62t9QIip4_s1 z*Co!0gQfv%x~LT5je&Owmh8sXb0<&j4qY2FJW)$8T)SWY?CB{xcl9S4s?YXFkNlB4 z7Sqzau~NJt8l_$gF{rv6_N2u@ziC%;TDp>P zYXfuTkH7WoLKU2+LU6Qxbhb`}KNSA2S zgMAH$e&4BbsJ_x`Vw2|J!`rA|mSfKUoAf{viT=OzVEwP@p+&U++XNw;Ll9x!Q+FYR zBtBvjYYCM3nIC0Z`%rAGv;5PiDNqknTQ6Y6vq{ckoxL?YR>^}>5if+kuTH>QT5A8k zB{7Z9qLY+rr!Cz`x+)%+Y|I?m5Vt6Hdi?jip%N7#6h`_-M}*9>$GMfTM3W{HYtMTJ za>Ycfg&`LNjX)-{us`p2#Zdm%u!vc30SH%sv_>%xtLub<{5TNvi-gga*@D1Cjgrcd zYb;gAah$Q|CsCoNi+t1OH4CCR2F_@SvQ_{%jzddvNGcr24-p>X2CT(0E~tpRecTCS zkW6seM*w?%yij(`-E@B3#nM7`tCX3uIe`Rk`y9wMG+d-W8(}$!o3SLZ8;*{|9*eWE zmW|Vk;J1o164!>K}wJk)s7=3 zru`8vWT$MV9qv^SKoy`2xRV z3my4lzg_qzWKfJJW(l}tD)|uYE{ggeiZQs$2_nG-_Ru1L)(y?^{?<;hgvV#l-S$v5 z2++pOzrz2rmF}Cvsm_Sc@ukoi6hKR2QRfu#8{I^q!+FjS3|Bis-vP8No4N3B*PITQ z&s^|v)}hO8j>BJ+*&Dt;<8X~VbHnJOuMYFVBZU4D!Vn=pQaIZkbBb<)L25Spkp8p) z0De9bt9ZIjO}uy{uLf*?|T1oUwRRPF>B0`PA;VJ-~=k29q*%~%x1SQ!9j H{sjI31)iZf delta 3230 zcmai%3p7-F7r@V%85(9VgLBMikVk`%NAe~ODvydNVm!tpkLwZTk}i`+5-E&VqDIM^ zyoxKYh%5=ohe95=yi*=UCB7NA+r8_4_xrxH);VXdb@tx>z5i?P-+!MPrDi4AeE%_J z2!LYHP!vKQqOwFzh?A5ALWyM@bSD(piUg93aCqjs%sitMrgZ9IlflNTGj_xQJ(+@? 
zrH%`cNw#0_?OvTwywOh&j9;h;i=UyVeQurPRxXhaJ#L?-(rE_ToQhV+$B*^<&xw>l zxym>ksLbh-qMSAWv}tj*{sdMgDm>A9d^}tA(X7>qf?kqL`V9@+7-0y)5_bZeL;~_G z%^kwRRCo{#_aPF$5wQR?EC!|GwgP6nLOIi&7C3o!03iYEym%&*gOs{&C0-bN+q|Uk z;O5XgFb&KAi3K_?lQ7jxeb4p?iOCs9H#mGzwl3aX#MrykO8lpb=I9vNH-)2c}6AlqhG@Hs@&19r}F@5hLu!ht4z90~+i6it4KuCP1t*z;@Z z6<_Mj&)ub@wWODirW1?WswHnkvt3PWYR+h)-C5ta|60YW?!rY*CZd z40oLDOL4)jTRcx0ZoO2qG=*wSRf((Zb3zVBI(qZ7Zk_kB2*ND;rS|42w##}p(&HIX zLKi8)qw=W6sjv2`RsCLF&D3e;+g~Ua2Czeb$H}j`r30}Cw4=;Jl5o_QyX&g_HxQVR zsQ7C)1!i2+w=H|k9IIxEhUmP0+iWOWJTLiVP|bRyHW zF3X{Z=lR`n^OU>uR~2>eyMAu|*_5uII%c*a;;|$OKlL?Pn$Ic4k2glb?kctlu_L-m zaoU87>cwG`ueihru65n=(>v`oTWk6EiCqsZvLNWT3RcUl8}s@+G|)1RWH^TgM$lVt zxCgKDq_@RTip0wbH#2rm=SdAQUQe{lUQfVH&}%qSnBH{P->&P67+0BBiqre}QW{X7 zQZ7l=K1)2W({^jmXkT>#zuY6q`G(*nNvw2o)$T;N`^i&(rPn1%Q@0t8V*7ac#dE%# zJ45Sl8^kZB(-UpYK0mUieC2zeZ2a)BLdCa`9{Jlw4y}783icXtwb@9BMunP;Q@B6k zmsL=nK|iN7HpQ3o#=l9w+MiZV59%ZEw?sbIu{0PRKYOY--SSgZSkFY}2Ju7QSh ztC8>ErDseEb>!0CT47gV)HObBX|9s1HQzh!_1J#~BaW7C9=!;dwl{2C2gsMcMGM zC|6|+RpEc<$0Aoh-N_bzQ3A*a9ODx{ZET{|zJt#*$8ShCfA*+rhLr5dfNpUw#qOmwQLUq+`(F=BQ5J){z7NHz3Y{hX6M9taa%0Mzl<2Ay{X#P?H#Ay zTqx+YpRSW;aU)MGjm?ex(8|gzavG^6&pj!;28Udfm24I%jJRH++kol9loy8k zw^F3>(L~=fEO2B`QDCQoKqbYFbT=7y$$qK4Ax zqhm~F(^wPFioP9SG=>o*>_})5kkJ4@y)tgW(@sDHl2fWDA_BznlSqPP0M4oDLO6h6 zWTMIR#=<&=IYuSye2OOSuQdt2xW)a^04* zLo2?!5v0o39w18l?argiXH`0Vp4a7=j+pDJeh5y(2x!{gvm9(eKNz=nRo8Wwd~^pn z)q(M!@@_oiZj|r~M%Y~Fj;V6yF4L1}QnRGxxwcM)n)K#Q@Ak{-dGr*Hk%%V|U52Cn z7ZM$ZJ)#(kh-7+LG#9IH2k?t&ViNb$m!4afeY+IY zku#F}X)sJ=Z=A<9Yem9hU&c-5VoJ=5Mj8Jy#z=+4k)i6dBFRcoeKVtOhYZ;*%;&$| zB|h{`T3Elhivd%w6T_#sY}U#5J-l@kioo4PH4O{m_`cW1&v??S~bqt_&x?_GjU{JU7l6QW*ZL z(X1A3rt{8Ri27Qe8u&sjSTlb+A?=#PntKZrGpwrmuiFonzD!8$jF#^>Kx$ijwyCIB zJ*;^6j`|6nf?u4w*Bl$?mclFj_&ey3d~PH?{o|l_mj|vh#HSGFu9#EZoh|p}eb0jr zvy#agenfx`0)b-n_V%r@ zfa`34o7)-3zyR;97GE-hmjuZ%jBx%g##@XuNM-O~6(p!2Ce)Z^Hsx5%4B zkPhs}QUI#mOfp_?VDK1M7E;U=9OFavKrE)1K@KdUFzT=hx>S&Zhb!CCMD}j-v33Hu z{=R;-!NpE4OQm^=WhG+SPWVeip1$8Bs^x&|5abm`gMctYpGTgRf)9^^0u@AZp_>ky z04V(?SpWx+{*ujU+F2Kmt!$=2cn~CKFrM&eb5kMQ))=Q|#V{9ywYGBFfjJ2P9IOO6 z^dTJpK>k=Cb2`AyF~H5+jdsRu%Tp2v7lY(sb)DbMxmcNrLQ3q+&hv`Nu`>I^2?P)! z)_BJ2dB*YYKCIwV6!1SKfsyLe#_R0e*RC-y=Kf5=xgeyD6{LZ{do5Q3O_2o>PxcNW3{5Oybnh^LED zi6r6`@-Tw*<6!vPNJ)@|mnaYn*J(o<067QxQ}G{95XMmfPMH;Lr~w4XU*vu$|JRD_ z-o-S42ylfil#2w&vC5<$yZ?{%D9i)b7_g!l)Q9u{av39AK#+?H|5+W|>0NAhsr%WV zm(>2^?LX>auZ|iZgGRA7hJ6vxzrWc}c#`d-Z_7uF=AS _GitHubQueryNamespace: def url(self) -> GitHubUrl: return self._url - def rate_limit(self) -> ParsedRateLimit: - return self.parse.rate_limit(self.req.rate_limit()) + def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: + limit = self.parse.rate_limit(self.req.rate_limit()) + if strict and limit["is_limited"]: + raise NotImplementedError(limit) + return limit - def tags(self, n_head: int | None, *, warn_lower: bool = False) -> pl.DataFrame: + def tags( + self, n_head: int | None = None, *, warn_lower: bool = False + ) -> pl.DataFrame: tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) @@ -516,48 +521,65 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: ) return df.select(*sorted(df.columns)) - def refresh( - self, fp_tags: Path | None = None, fp_trees: Path | None = None - ) -> pl.DataFrame: + def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: """ Use known tags to discover and update missing trees metadata. Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
""" - rate_limit = self.rate_limit() - if rate_limit["is_limited"]: - raise NotImplementedError(rate_limit) - fp_tags = fp_tags or self._paths["tags"] - fp_trees = fp_trees or self._paths["trees"] - IS_AUTH = rate_limit["is_auth"] - UNAUTH_LIMIT = self.req._UNAUTH_TREES_LIMIT - - tags = ( - self._refresh_tags(fp_tags) - if IS_AUTH or rate_limit["remaining"] > UNAUTH_LIMIT - else pl.read_parquet(fp_tags) - ) - trees = pl.read_parquet(fp_trees) - - missing_trees = tags.join( + rate_limit = self.rate_limit(strict=True) + fp = self._paths["trees"] + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( trees.select(pl.col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): - print(f"Already up-to-date {fp_trees!s}") + print(f"Already up-to-date {fp!s}") return trees else: - it = islice( - missing_trees.iter_rows(named=True), None if IS_AUTH else UNAUTH_LIMIT - ) + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT + it = islice(missing_trees.iter_rows(named=True), stop) missing = cast("Iterator[ReParsedTag]", it) fresh_rows = self._trees_batched(missing) print( f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp_trees!s}" + f"Writing {fresh_rows.height} new rows to {fp!s}" + ) + return pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + + def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: + limit = self.rate_limit(strict=True) + npm_tag_only = npm_tags.lazy().select("tag") + fp = self._paths["tags"] + if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: + return ( + pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() + ) + elif not fp.exists(): + print(f"Initializing {fp!s}") + tags = ( + self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + print(f"Collected {tags.height} new tags") + return tags + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + latest = ( + self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + if latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(), prev_eager), how="vertical") + .unique("sha") + .pipe(_sort_sem_ver) ) - refreshed = pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) - _write_parquet(refreshed, fp_trees, write_schema=self._write_schema) - return refreshed + print(f"Collected {tags.height - prev_eager.height} new tags") + return tags def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: rate_limit = self.rate_limit() @@ -581,45 +603,6 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: dfs.append(self.trees(tag)) return pl.concat(dfs) - def _refresh_tags( - self, fp: Path | None = None, *, limit_new: int | None = None - ) -> pl.DataFrame: - n_new_tags: int = 0 - fp = fp or self._paths["tags"] - if not fp.exists(): - print(f"Initializing {fp!s}") - tags = self.tags(limit_new) - n_new_tags = tags.height - else: - print("Checking for new tags") - prev = pl.scan_parquet(fp) - curr_latest = self.tags(1) - # TODO: Needs a hook for `_npm_metadata()` - if curr_latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): - print(f"Already up-to-date {fp!s}") - return prev.collect() - else: - print(f"Refreshing {fp!s}") - prev_eager = prev.collect() - tags = ( - pl.concat((self.tags(limit_new), prev_eager), how="vertical") - .unique("sha") - .pipe(_sort_sem_ver) - ) - 
n_new_tags = tags.height - prev_eager.height - print(f"Collected {n_new_tags} new tags") - _write_parquet(tags, fp, write_schema=self._write_schema) - return tags - - -_root_dir: Path = Path(__file__).parent - -GitHub = _GitHub( - _root_dir / "_vega_datasets_data", - name_trees="metadata_full", - name_tags="tags", - write_schema=True, -) ####################################################################################### @@ -678,14 +661,85 @@ def tags(self) -> pl.DataFrame: return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) -Npm = _Npm(_root_dir / "_vega_datasets_data", name_tags="tags_npm", write_schema=True) +class Application: + """ + Top-level context. + + When ``write_schema``, addtional ``...-schema.json`` files are produced + that describes the metadata columns. + """ + + def __init__( + self, + output_dir: Path, + *, + write_schema: bool, + trees_gh: str = "metadata_full", + tags_gh: str = "tags", + tags_npm: str = "tags_npm", + kwds_gh: Mapping[str, Any] | None = None, + kwds_npm: Mapping[str, Any] | None = None, + ) -> None: + output_dir.mkdir(exist_ok=True) + kwds_gh = kwds_gh or {} + kwds_npm = kwds_npm or {} + self._write_schema: bool = write_schema + self._github: _GitHub = _GitHub( + output_dir, + name_tags=tags_gh, + name_trees=trees_gh, + write_schema=write_schema, + **kwds_gh, + ) + self._npm: _Npm = _Npm( + output_dir, + name_tags=tags_npm, + write_schema=write_schema, + **kwds_npm, + ) + + @property + def github(self) -> _GitHub: + return self._github + + @property + def npm(self) -> _Npm: + return self._npm + + def refresh(self) -> pl.DataFrame: + npm_tags = self.npm.tags() + self.write_parquet(npm_tags, self.npm._paths["tags"]) + + gh_tags = self.github.refresh_tags(npm_tags) + self.write_parquet(gh_tags, self.github._paths["tags"]) + + gh_trees = self.github.refresh_trees(gh_tags) + self.write_parquet(gh_trees, self.github._paths["trees"]) + return gh_trees + + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + """Write ``frame`` to ``fp``, with some extra safety.""" + if not fp.exists(): + fp.touch() + df = frame.lazy().collect() + df.write_parquet(fp, compression="zstd", compression_level=17) + if self._write_schema: + schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} + fp_schema = fp.with_name(f"{fp.stem}-schema.json") + if not fp_schema.exists(): + fp_schema.touch() + with fp_schema.open("w") as f: + json.dump(schema, f, indent=2) + + +app = Application(Path(__file__).parent / "_vega_datasets_data", write_schema=True) def _tag_from(s: str, /) -> str: # - Actual tag # - Trees url (using ref name) # - npm url (works w/o the `v` prefix) - trees_url = GitHub.url.TREES + trees_url = app.github.url.TREES if s.startswith("v"): return s elif s.startswith(trees_url): @@ -727,28 +781,6 @@ def _sort_sem_ver(frame: _Frame, /) -> _Frame: return frame.sort(_SEM_VER_FIELDS, descending=True) -def _write_parquet( - frame: pl.DataFrame | pl.LazyFrame, fp: Path, /, *, write_schema: bool -) -> None: - """ - Write ``frame`` to ``fp``, with some extra safety. - - When ``write_schema``, an addtional ``...-schema.json`` file is produced - that describes the metadata columns. 
- """ - if not fp.exists(): - fp.touch() - df = frame.lazy().collect() - df.write_parquet(fp, compression="zstd", compression_level=17) - if write_schema: - schema = {name: tp.__name__ for name, tp in df.schema.to_python().items()} - fp_schema = fp.with_name(f"{fp.stem}-schema.json") - if not fp_schema.exists(): - fp_schema.touch() - with fp_schema.open("w") as f: - json.dump(schema, f, indent=2) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago @@ -960,7 +992,7 @@ def __call__( else: constraints["suffix"] = ext q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] - return GitHub.query.url_from(**q) + return app.github.query.url_from(**q) data = DataLoader() From 6527305cc5d82f54c529faafeceb90ca301b1e73 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:24:36 +0000 Subject: [PATCH 027/137] fix: Invalidate old trees --- .../_vega_datasets_data/metadata_full.parquet | Bin 21362 -> 20768 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata_full.parquet index 7a4e691cb414735738f276950d79e8c72c5f4b48..071e4bd6cf68fcc17952c5057858fa29399c9415 100644 GIT binary patch delta 9809 zcmdUVby!qi+wKG~bPpXvH_|22-6aBwGz!w)Lzi?bNH>BYsC0LiB1$)cG%|xQ9E|Vx zzUO@3`ObCzIDedV?Q8F~)?WA8&ph|D?maW>IVlC^7Xk@&WcB!=0B9OIbO*={-RDXs zB!{a3?t=jaXkY*Uu+IgQ1_A(Bs2fy=9#BUE1_YyE8pJ?!)UsBG*01hml|v;3IVp4$ zob2|V$nBjD(q&>^ZXg7;3}*)t!gSacFuIOMF&k`_=VhmSu*+;r=o&H3!T=ScIub z2}oo6YagT|H5r>EHGoa9o*Io11c#*zJr$#!C=(YTGLd}olIdeCOHsnyPa+AJEKEm% zT|X02APIvd;e}5VueApgdS{~ntldmyK%-qDO*oy@3V=PrwlNd^EnuMy2# zJYvafxA}fXr{r|Bz^G!Uwzp{8N~~_pllatGqB2Kd^ejI<=#uTUv9%2t-WFgC?9n>~ z^&&%~<$dPq$vVE?M}H}gu_C8~FSD9&Zr6|3s{y{3*^`m7{f^?T&`QO$UNb)hYcIc? 
zjK^!q_`?TIW)el(0rBA!g)L`aX`K$j<-j2Jdc-jp(`lhk10}G6#2i>SvMSncDPz)^ z-ljs;{lEcIISd$lUYm!vW$U+aS0jS;^ITsI(G_dSVKs5__!WGPzMePKP07JpUpc-$ z44B2)Y`k^)*IvOMYu zwu7z}IAGyTs*{<+y*u>O&vBLt_)J?9yR%Q%%O38u>CaCNjD026Wm$jAb{fK{_d$Fr ziT#V=zIA+CTYjW*cF+?i4=z~k9g{7|B;d+xJNM)Gyow*(yMk_>#3E%m-dG1X`aEeA;1SY zQd?p_-OeMc$3j1&kF)6K=nCrN8y^45gW6cAT)HhP81H}yU!Ubxw?6qxj1NYGaEZ@H6mYlNWb0TXhV3T z68AIYavRohGjK)7Kf20RB`$xp+x5W(kLPFZxP@pBH{d5lZyync!NHVN=QTias`LGm zigAD%Cj<3C6hD(Hw33tgjO}Hbm11&x$xEvKuELAO0P$wfyz;7H`I)q9p30)8!l`O= z*o2D6q%Idg+_o5XDJU*VR74>)X{R_9&|P63*L7aI+J8b>z+^yWs%m{B>A~xj2qFOs ze5;vVjq0eV!`!uu_^8q-%Gbb`9TXHvm&(ZSrHJ_ngm`hlT$S8>`r~F<+7-zgrOIVr z(`AV5WSokPcrQh~eA#hPa%}%CdQG9bXY+n*K?g?Gl1;Z7q=t)@?`LCG;MH_S_H71U z;VC7hKEAkhX9DXY*kA0?zOd+Rzb1EVyZ#7Qw#0JRI81!h#x}txqNr#9c(c69bLl7J z(@xaLQ{aB`lCnuJeoP^+EY80z=5n#l54S6i3v*kZKIXn^LMT&F&=puXdG;s<`+Dn_ z3wL|8`|WKo?;3xb3iTn0vL}rmlk9o@P|Ho>id?8JLpED=@X;pMbux+_bX^Js{gsPy zadZBYh>9GcL-gZ1adg#yWzq7yarJp>YFM@@B#7($3(fVvUIzF)_;0_Q8{R?uw;!+c zpZ=90JO%c5JoF*F9QSWO-wi&D^|${u3k*La|NC$~7rfsIt*QAQ>$>QQUJJ>7ojH!j zMHFLk;QdM*gTd6l>MRh0DmYh_<0(0f{Nf?vx16w07UQB<(xieK8cuURBO z9K$;FXeA2Dzr{3v>Oo*A1b_K&row{XOv!fi|JT9<7^361$n>>{Jc#4DGr9vsiMpQp zf60|2rhkj17h=IhKpfj1^#9Hz{*TFFDzUxa{QcMbQXr0%O7MRK{@2vwh!`UJI!#H7 z2rj~cmqM;>_zdfp=>I8D{u3v(h{E53rKMQ#Vk{6Mx0~s@q>cj*v6oU$dW#JEzW_M8 BVU_>@ From 336eeca4d273ae756b57b234601f681ff30dcd7d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:30:21 +0000 Subject: [PATCH 028/137] chore: Remove early test files# --- tools/_vega_datasets_data/metadata-schema.json | 12 ------------ tools/_vega_datasets_data/metadata.parquet | Bin 9100 -> 0 bytes .../metadata_v2.5.4-v2.9.0.parquet | Bin 11354 -> 0 bytes 3 files changed, 12 deletions(-) delete mode 100644 tools/_vega_datasets_data/metadata-schema.json delete mode 100644 tools/_vega_datasets_data/metadata.parquet delete mode 100644 tools/_vega_datasets_data/metadata_v2.5.4-v2.9.0.parquet diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/_vega_datasets_data/metadata-schema.json deleted file mode 100644 index 2b5b9d955..000000000 --- a/tools/_vega_datasets_data/metadata-schema.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "ext_supported": "bool", - "file_name": "str", - "name_collision": "bool", - "name_js": "str", - "name_py": "str", - "size": "int", - "suffix": "str", - "tag": "str", - "url_github": "str", - "url_npm": "str" -} \ No newline at end of file diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/_vega_datasets_data/metadata.parquet deleted file mode 100644 index 1ab0fb17143528da9cd460e84a0fb18a9f1d5b73..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9100 zcmds7c|4SB`+o*87`tc8ls(4IkZnYaonv3JhMBP>%V0=J*=eyCB9e}>CKM+vgwlc{ zOS@2(N>SDp^?RN{r*qDG&Zpno`^WqH&6wxjuj{(6`+ME@bzct~qMb4bfD)j(JeZ9D znc=UXIv|ZeU=RTGk#1007Uj(X07j^gjX#F! 

From 225be0a15520d166bddd162307ff1b82f2552bf7 Mon Sep 17 00:00:00 2001
From: dangotbanned <125183946+dangotbanned@users.noreply.github.com>
Date: Wed, 6 Nov 2024 14:33:20 +0000
Subject: [PATCH 029/137] refactor: Rename `metadata_full` -> `metadata`

Suffix was only added due to *now-removed* test files
---
 ...tadata_full-schema.json => metadata-schema.json} |   0
 .../{metadata_full.parquet => metadata.parquet}     | Bin
 tools/vendor_datasets.py                            |   2 +-
 3 files changed, 1 insertion(+), 1 deletion(-)
 rename tools/_vega_datasets_data/{metadata_full-schema.json => metadata-schema.json} (100%)
 rename tools/_vega_datasets_data/{metadata_full.parquet => metadata.parquet} (100%)

diff --git a/tools/_vega_datasets_data/metadata_full-schema.json b/tools/_vega_datasets_data/metadata-schema.json
similarity index 100%
rename from tools/_vega_datasets_data/metadata_full-schema.json
rename to
tools/_vega_datasets_data/metadata-schema.json diff --git a/tools/_vega_datasets_data/metadata_full.parquet b/tools/_vega_datasets_data/metadata.parquet similarity index 100% rename from tools/_vega_datasets_data/metadata_full.parquet rename to tools/_vega_datasets_data/metadata.parquet diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py index 208834ebf..45fa27614 100644 --- a/tools/vendor_datasets.py +++ b/tools/vendor_datasets.py @@ -674,7 +674,7 @@ def __init__( output_dir: Path, *, write_schema: bool, - trees_gh: str = "metadata_full", + trees_gh: str = "metadata", tags_gh: str = "tags", tags_npm: str = "tags_npm", kwds_gh: Mapping[str, Any] | None = None, From e91baab65642dd9b81020b88f50314943d5b15c4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:42:16 +0000 Subject: [PATCH 030/137] refactor: `tools.vendor_datasets` -> `tools.datasets` package Will be following up with some more splitting into composite modules --- tools/{vendor_datasets.py => datasets/__init__.py} | 2 +- .../_metadata}/metadata-schema.json | 0 .../_metadata}/metadata.parquet | Bin .../_metadata}/tags-schema.json | 0 .../_metadata}/tags.parquet | Bin .../_metadata}/tags_npm-schema.json | 0 .../_metadata}/tags_npm.parquet | Bin 7 files changed, 1 insertion(+), 1 deletion(-) rename tools/{vendor_datasets.py => datasets/__init__.py} (99%) rename tools/{_vega_datasets_data => datasets/_metadata}/metadata-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/metadata.parquet (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags.parquet (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags_npm-schema.json (100%) rename tools/{_vega_datasets_data => datasets/_metadata}/tags_npm.parquet (100%) diff --git a/tools/vendor_datasets.py b/tools/datasets/__init__.py similarity index 99% rename from tools/vendor_datasets.py rename to tools/datasets/__init__.py index 45fa27614..e27f011f0 100644 --- a/tools/vendor_datasets.py +++ b/tools/datasets/__init__.py @@ -732,7 +732,7 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None json.dump(schema, f, indent=2) -app = Application(Path(__file__).parent / "_vega_datasets_data", write_schema=True) +app = Application(Path(__file__).parent / "_metadata", write_schema=True) def _tag_from(s: str, /) -> str: diff --git a/tools/_vega_datasets_data/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json similarity index 100% rename from tools/_vega_datasets_data/metadata-schema.json rename to tools/datasets/_metadata/metadata-schema.json diff --git a/tools/_vega_datasets_data/metadata.parquet b/tools/datasets/_metadata/metadata.parquet similarity index 100% rename from tools/_vega_datasets_data/metadata.parquet rename to tools/datasets/_metadata/metadata.parquet diff --git a/tools/_vega_datasets_data/tags-schema.json b/tools/datasets/_metadata/tags-schema.json similarity index 100% rename from tools/_vega_datasets_data/tags-schema.json rename to tools/datasets/_metadata/tags-schema.json diff --git a/tools/_vega_datasets_data/tags.parquet b/tools/datasets/_metadata/tags.parquet similarity index 100% rename from tools/_vega_datasets_data/tags.parquet rename to tools/datasets/_metadata/tags.parquet diff --git a/tools/_vega_datasets_data/tags_npm-schema.json b/tools/datasets/_metadata/tags_npm-schema.json similarity index 100% rename from 
tools/_vega_datasets_data/tags_npm-schema.json rename to tools/datasets/_metadata/tags_npm-schema.json diff --git a/tools/_vega_datasets_data/tags_npm.parquet b/tools/datasets/_metadata/tags_npm.parquet similarity index 100% rename from tools/_vega_datasets_data/tags_npm.parquet rename to tools/datasets/_metadata/tags_npm.parquet From 7782925b3291a8d3b6ff38b5572e3e47c06ebed3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:55:10 +0000 Subject: [PATCH 031/137] refactor: Move `TypedDict`, `NamedTuple`(s) -> `datasets.models` --- tools/datasets/__init__.py | 187 ++++--------------------------------- tools/datasets/models.py | 166 ++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+), 167 deletions(-) create mode 100644 tools/datasets/models.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index e27f011f0..2b87ded3b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,7 +10,6 @@ import json import os import random -import sys import tempfile import time import urllib.request @@ -19,27 +18,28 @@ from functools import cached_property, partial from itertools import islice from pathlib import Path -from typing import ( - IO, - TYPE_CHECKING, - Any, - Callable, - ClassVar, - Literal, - NamedTuple, - cast, - get_args, -) +from typing import IO, TYPE_CHECKING, Any, Callable, ClassVar, Literal, cast, get_args from urllib.request import urlopen import polars as pl -if sys.version_info >= (3, 14): - from typing import TypedDict -else: - from typing_extensions import TypedDict +from tools.datasets.models import ( + GitHubRateLimitResources, + GitHubTag, + GitHubTree, + GitHubTreesResponse, + GitHubUrl, + NpmPackageMetadataResponse, + NpmUrl, + ParsedRateLimit, + ParsedTag, + ParsedTree, + QueryTree, + ReParsedTag, +) if TYPE_CHECKING: + import sys from collections.abc import Mapping, MutableMapping from email.message import Message from typing import TypeVar @@ -50,9 +50,9 @@ else: from typing_extensions import TypeIs if sys.version_info >= (3, 11): - from typing import LiteralString, Required + from typing import LiteralString else: - from typing_extensions import LiteralString, Required + from typing_extensions import LiteralString if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -81,153 +81,6 @@ def _is_str(obj: Any) -> TypeIs[str]: return isinstance(obj, str) -class GitHubUrl(NamedTuple): - BASE: LiteralString - RATE: LiteralString - REPO: LiteralString - TAGS: LiteralString - TREES: LiteralString - - -class NpmUrl(NamedTuple): - CDN: LiteralString - TAGS: LiteralString - - -class GitHubTag(TypedDict): - name: str - node_id: str - commit: dict[Literal["sha", "url"], str] - zipball_url: str - tarball_url: str - - -class ParsedTag(TypedDict): - tag: str - sha: str - trees_url: str - - -class ReParsedTag(ParsedTag): - major: int - minor: int - patch: int - pre_release: int | None - is_pre_release: bool - - -class GitHubTree(TypedDict): - """ - A single file's metadata within the response of `Get a tree`_. - - .. _Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - path: str - mode: str - type: str - sha: str - size: int - url: str - - -class GitHubTreesResponse(TypedDict): - """ - Response from `Get a tree`_. - - Describes directory metadata, with files stored in ``"tree"``. - - .. 
_Get a tree: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - """ - - sha: str - url: str - tree: list[GitHubTree] - truncated: bool - - -class NpmVersion(TypedDict): - version: str - links: dict[Literal["self", "entrypoints", "stats"], str] - - -class NpmPackageMetadataResponse(TypedDict): - """ - Response from `Get package metadata`_. - - Using: - - headers={"Accept": "application/json"} - - .. _Get package metadata: - https://data.jsdelivr.com/v1/packages/npm/vega-datasets - """ - - type: str - name: str - tags: dict[Literal["canary", "next", "latest"], str] - versions: list[NpmVersion] - links: dict[Literal["stats"], str] - - -class ParsedTree(TypedDict): - file_name: str - name_js: str - name_py: str - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - -class QueryTree(TypedDict, total=False): - file_name: str - name_js: Required[str] - name_py: str - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - -class ParsedTreesResponse(TypedDict): - tag: str - url: str - tree: list[ParsedTree] - - -class GitHubRateLimit(TypedDict): - limit: int - used: int - remaining: int - reset: int - - -class ParsedRateLimit(GitHubRateLimit): - reset_time: time.struct_time - is_limited: bool - is_auth: bool - - -class GitHubRateLimitResources(TypedDict, total=False): - """ - A subset of response from `Get rate limit status for the authenticated user`_. - - .. _Get rate limit status for the authenticated user: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ - - core: Required[GitHubRateLimit] - search: Required[GitHubRateLimit] - graphql: GitHubRateLimit - integration_manifest: GitHubRateLimit - code_search: GitHubRateLimit - - class _ErrorHandler(urllib.request.BaseHandler): """ Adds `rate limit`_ info to a forbidden error. @@ -608,6 +461,8 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: class _Npm: + """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + def __init__( self, output_dir: Path, @@ -958,8 +813,6 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() - # BUG: # 1.6.0 exists on GH but not npm? 
- # https://www.jsdelivr.com/docs/data.jsdelivr.com#overview def __call__( self, name: str, diff --git a/tools/datasets/models.py b/tools/datasets/models.py new file mode 100644 index 000000000..5a6598fed --- /dev/null +++ b/tools/datasets/models.py @@ -0,0 +1,166 @@ +"""API-related data structures.""" + +from __future__ import annotations + +import sys +from typing import TYPE_CHECKING, Literal, NamedTuple + +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +if TYPE_CHECKING: + import time + + if sys.version_info >= (3, 11): + from typing import LiteralString, Required + else: + from typing_extensions import LiteralString, Required + + +class GitHubUrl(NamedTuple): + BASE: LiteralString + RATE: LiteralString + REPO: LiteralString + TAGS: LiteralString + TREES: LiteralString + + +class NpmUrl(NamedTuple): + CDN: LiteralString + TAGS: LiteralString + + +class GitHubTag(TypedDict): + name: str + node_id: str + commit: dict[Literal["sha", "url"], str] + zipball_url: str + tarball_url: str + + +class ParsedTag(TypedDict): + tag: str + sha: str + trees_url: str + + +class ReParsedTag(ParsedTag): + major: int + minor: int + patch: int + pre_release: int | None + is_pre_release: bool + + +class GitHubTree(TypedDict): + """ + A single file's metadata within the response of `Get a tree`_. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + + path: str + mode: str + type: str + sha: str + size: int + url: str + + +class GitHubTreesResponse(TypedDict): + """ + Response from `Get a tree`_. + + Describes directory metadata, with files stored in ``"tree"``. + + .. _Get a tree: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + """ + + sha: str + url: str + tree: list[GitHubTree] + truncated: bool + + +class NpmVersion(TypedDict): + version: str + links: dict[Literal["self", "entrypoints", "stats"], str] + + +class NpmPackageMetadataResponse(TypedDict): + """ + Response from `Get package metadata`_. + + Using: + + headers={"Accept": "application/json"} + + .. _Get package metadata: + https://data.jsdelivr.com/v1/packages/npm/vega-datasets + """ + + type: str + name: str + tags: dict[Literal["canary", "next", "latest"], str] + versions: list[NpmVersion] + links: dict[Literal["stats"], str] + + +class ParsedTree(TypedDict): + file_name: str + name_js: str + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str + + +class QueryTree(TypedDict, total=False): + file_name: str + name_js: Required[str] + name_py: str + suffix: str + size: int + url: str + ext_supported: bool + tag: str + + +class ParsedTreesResponse(TypedDict): + tag: str + url: str + tree: list[ParsedTree] + + +class GitHubRateLimit(TypedDict): + limit: int + used: int + remaining: int + reset: int + + +class ParsedRateLimit(GitHubRateLimit): + reset_time: time.struct_time + is_limited: bool + is_auth: bool + + +class GitHubRateLimitResources(TypedDict, total=False): + """ + A subset of response from `Get rate limit status for the authenticated user`_. + + .. 
_Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ + + core: Required[GitHubRateLimit] + search: Required[GitHubRateLimit] + graphql: GitHubRateLimit + integration_manifest: GitHubRateLimit + code_search: GitHubRateLimit From bc86ca18101e9e688caec7ea5e66afc2810ef993 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:16:05 +0000 Subject: [PATCH 032/137] refactor: Move, rename `semver`-related tools --- tools/datasets/__init__.py | 55 ++++++------------------------------ tools/datasets/semver.py | 57 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 46 deletions(-) create mode 100644 tools/datasets/semver.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 2b87ded3b..ce61dbbe7 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -23,6 +23,7 @@ import polars as pl +from tools.datasets import semver from tools.datasets.models import ( GitHubRateLimitResources, GitHubTag, @@ -42,7 +43,6 @@ import sys from collections.abc import Mapping, MutableMapping from email.message import Message - from typing import TypeVar from urllib.request import OpenerDirector, Request if sys.version_info >= (3, 13): @@ -59,7 +59,6 @@ from typing_extensions import TypeAlias from tools.schemapi.utils import OneOrSeq - _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) _PathName: TypeAlias = Literal["dir", "tags", "trees"] WorkInProgress: TypeAlias = Any @@ -71,10 +70,6 @@ _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" _SUB_DIR = "data" -_SEM_VER_FIELDS: tuple[ - Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] -] = "major", "minor", "patch", "pre_release" -_CANARY: Literal["--canary"] = "--canary" def _is_str(obj: Any) -> TypeIs[str]: @@ -350,7 +345,7 @@ def tags( self, n_head: int | None = None, *, warn_lower: bool = False ) -> pl.DataFrame: tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(_with_sem_ver) + return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: """Retrieve directory info for a given version ``tag``.""" @@ -398,7 +393,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: f"Finished collection.\n" f"Writing {fresh_rows.height} new rows to {fp!s}" ) - return pl.concat((trees, fresh_rows)).pipe(_sort_sem_ver) + return pl.concat((trees, fresh_rows)).pipe(semver.sort) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) @@ -421,7 +416,7 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: latest = ( self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() ) - if latest.equals(prev.pipe(_sort_sem_ver).head(1).collect()): + if latest.equals(prev.pipe(semver.sort).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() print(f"Refreshing {fp!s}") @@ -429,16 +424,14 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: tags = ( pl.concat((self.tags(), prev_eager), how="vertical") .unique("sha") - .pipe(_sort_sem_ver) + .pipe(semver.sort) ) print(f"Collected {tags.height - prev_eager.height} new tags") return tags def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: - rate_limit = 
self.rate_limit() - if rate_limit["is_limited"]: - raise NotImplementedError(rate_limit) - elif not isinstance(tags, Sequence): + rate_limit = self.rate_limit(strict=True) + if not isinstance(tags, Sequence): tags = tuple(tags) req = self.req n = len(tags) @@ -511,9 +504,9 @@ def tags(self) -> pl.DataFrame: versions = [ f"v{tag}" for v in content["versions"] - if (tag := v["version"]) and _CANARY not in tag + if (tag := v["version"]) and semver.CANARY not in tag ] - return pl.DataFrame({"tag": versions}).pipe(_with_sem_ver) + return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) class Application: @@ -606,36 +599,6 @@ def _tag_from(s: str, /) -> str: raise TypeError(s) -def _with_sem_ver(df: pl.DataFrame, *, col_tag: str = "tag") -> pl.DataFrame: - """ - Extracts components of a `SemVer`_ string into sortable columns. - - .. _SemVer: - https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions - """ - fields = pl.col(_SEM_VER_FIELDS) - pattern = r"""(?x) - v?(?[[:digit:]]*)\. - (?[[:digit:]]*)\. - (?[[:digit:]]*) - (\-(next)?(beta)?\.)? - (?[[:digit:]]*)? - """ - sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) - return ( - df.lazy() - .with_columns(sem_ver) - .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) - .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) - .collect() - ) - - -def _sort_sem_ver(frame: _Frame, /) -> _Frame: - """Sort ``frame``, displaying in descending release order.""" - return frame.sort(_SEM_VER_FIELDS, descending=True) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py new file mode 100644 index 000000000..cb4c6c799 --- /dev/null +++ b/tools/datasets/semver.py @@ -0,0 +1,57 @@ +""" +Parsing/transforming semantic versioning strings. + +.. _semantic versioning: + https://semver.org/ +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +import polars as pl + +if TYPE_CHECKING: + from typing import TypeVar + + _Frame = TypeVar("_Frame", pl.DataFrame, pl.LazyFrame) + +__all__ = ["CANARY", "sort", "with_columns"] + +_SEM_VER_FIELDS: tuple[ + Literal["major"], Literal["minor"], Literal["patch"], Literal["pre_release"] +] = "major", "minor", "patch", "pre_release" +CANARY: Literal["--canary"] = "--canary" + + +def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: + """ + Extracts components of a `SemVer`_ string into sortable columns. + + .. _SemVer: + https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions + """ + fields = pl.col(_SEM_VER_FIELDS) + pattern = r"""(?x) + v?(?[[:digit:]]*)\. + (?[[:digit:]]*)\. + (?[[:digit:]]*) + (\-(next)?(beta)?\.)? + (?[[:digit:]]*)? 
+ """ + sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + ldf = ( + frame.lazy() + .with_columns(sem_ver) + .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) + .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + ) + if isinstance(frame, pl.DataFrame): + return ldf.collect() + else: + return ldf + + +def sort(frame: _Frame, /) -> _Frame: + """Sort ``frame``, displaying in descending release order.""" + return frame.sort(_SEM_VER_FIELDS, descending=True) From a6f56452df200ef2049aa3203e79f1d70005a198 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:19:13 +0000 Subject: [PATCH 033/137] refactor: Remove `write_schema` from `_Npm`, `_GitHub` Handled in `Application` now --- tools/datasets/__init__.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ce61dbbe7..e26472c2f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -296,14 +296,10 @@ def __init__( name_tags: str, name_trees: str, *, - write_schema: bool, base_url: LiteralString = "https://api.github.com/", org: LiteralString = "vega", package: LiteralString = "vega-datasets", ) -> None: - # When ``write_schema``, addtional ``...-schema.json`` file(s) are produced - # that describes column types - in a non-binary format. - self._write_schema: bool = write_schema output_dir.mkdir(exist_ok=True) self._paths: dict[_PathName, Path] = { "dir": output_dir, @@ -461,13 +457,11 @@ def __init__( output_dir: Path, name_tags: str, *, - write_schema: bool, jsdelivr: Literal["jsdelivr"] = "jsdelivr", npm: Literal["npm"] = "npm", package: LiteralString = "vega-datasets", jsdelivr_version: LiteralString = "v1", ) -> None: - self._write_schema: bool = write_schema output_dir.mkdir(exist_ok=True) self._paths: dict[Literal["tags"], Path] = { "tags": output_dir / f"{name_tags}.parquet" @@ -533,18 +527,9 @@ def __init__( kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema self._github: _GitHub = _GitHub( - output_dir, - name_tags=tags_gh, - name_trees=trees_gh, - write_schema=write_schema, - **kwds_gh, - ) - self._npm: _Npm = _Npm( - output_dir, - name_tags=tags_npm, - write_schema=write_schema, - **kwds_npm, + output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) + self._npm: _Npm = _Npm(output_dir, name_tags=tags_npm, **kwds_npm) @property def github(self) -> _GitHub: From 07a8342c95544fbbacff808f8d4d3868a1215a2c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:00:12 +0000 Subject: [PATCH 034/137] refactor: Rename, split `_Npm`, `_GitHub` into own modules `tools.datasets.npm` will later be performing the requests that are in `Dataset.__call__` currently --- tools/datasets/__init__.py | 497 +------------------------------------ tools/datasets/github.py | 455 +++++++++++++++++++++++++++++++++ tools/datasets/npm.py | 76 ++++++ 3 files changed, 541 insertions(+), 487 deletions(-) create mode 100644 tools/datasets/github.py create mode 100644 tools/datasets/npm.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index e26472c2f..bcbe725a1 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,42 +8,21 @@ from __future__ import annotations import json -import os -import random import tempfile -import time -import urllib.request -import warnings -from 
collections.abc import Iterable, Iterator, Sequence from functools import cached_property, partial -from itertools import islice from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, ClassVar, Literal, cast, get_args +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, get_args from urllib.request import urlopen import polars as pl -from tools.datasets import semver -from tools.datasets.models import ( - GitHubRateLimitResources, - GitHubTag, - GitHubTree, - GitHubTreesResponse, - GitHubUrl, - NpmPackageMetadataResponse, - NpmUrl, - ParsedRateLimit, - ParsedTag, - ParsedTree, - QueryTree, - ReParsedTag, -) +from tools.datasets.github import GitHub +from tools.datasets.models import QueryTree +from tools.datasets.npm import Npm if TYPE_CHECKING: import sys - from collections.abc import Mapping, MutableMapping - from email.message import Message - from urllib.request import OpenerDirector, Request + from collections.abc import Mapping if sys.version_info >= (3, 13): from typing import TypeIs @@ -57,450 +36,10 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.schemapi.utils import OneOrSeq - _PathName: TypeAlias = Literal["dir", "tags", "trees"] WorkInProgress: TypeAlias = Any - -_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -"""Query result scalar selection.""" - -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" - - -def _is_str(obj: Any) -> TypeIs[str]: - return isinstance(obj, str) - - -class _ErrorHandler(urllib.request.BaseHandler): - """ - Adds `rate limit`_ info to a forbidden error. - - .. _rate limit: - https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 - """ - - def http_error_default( - self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message - ): - if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): - limit = hdrs.get("X-RateLimit-Limit", "") - remaining = hdrs.get("X-RateLimit-Remaining", "") - msg = ( - f"{msg}\n\nFailed to balance rate limit.\n" - f"{limit=}, {remaining=}\n" - f"Reset: {time.localtime(int(reset))!r}" - ) - raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) - - -class _GitHubRequestNamespace: - """ - Fetching resources from the `GitHub API`_. - - .. 
_GitHub API: - https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 - """ - - _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" - _TAGS_MAX_PAGE: Literal[100] = 100 - _VERSION: LiteralString = "2022-11-28" - _UNAUTH_RATE_LIMIT: Literal[60] = 60 - _TAGS_COST: Literal[1] = 1 - _TREES_COST: Literal[2] = 2 - _UNAUTH_DELAY: Literal[5] = 5 - _AUTH_DELAY: Literal[1] = 1 - _UNAUTH_TREES_LIMIT: Literal[10] = 10 - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self) -> GitHubRateLimitResources: - """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" - with self._gh._opener.open(self._request(self.url.RATE)) as response: - content: GitHubRateLimitResources = json.load(response)["resources"] - return content - - def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" - if n < 1 or n > self._TAGS_MAX_PAGE: - raise ValueError(n) - req = self._request(f"{self.url.TAGS}?per_page={n}") - with self._gh._opener.open(req) as response: - content: list[GitHubTag] = json.load(response) - if warn_lower and len(content) < n: - earliest = response[-1]["name"] - n_response = len(content) - msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" - warnings.warn(msg, stacklevel=3) - return content - - def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: - """ - For a given ``tag``, perform **2x requests** to get directory metadata. - - Returns response unchanged - but with annotations. - """ - if _is_str(tag): - url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" - else: - url = tag["trees_url"] - with self._gh._opener.open(self._request(url)) as response: - content: GitHubTreesResponse = json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) - if data_url := next(query, None): - with self._gh._opener.open(self._request(data_url)) as response: - data_dir: GitHubTreesResponse = json.load(response) - return data_dir - else: - raise FileNotFoundError - - def _request(self, url: str, /, *, raw: bool = False) -> Request: - """ - Wrap a request url with a `personal access token`_ - if set as an env var. - - By default the endpoint returns json, specify raw to get blob data. - See `Media types`_. - - .. _personal access token: - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens - .. _Media types: - https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types - """ - headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} - if tok := os.environ.get(self._ENV_VAR): - headers["Authorization"] = ( - tok if tok.startswith("Bearer ") else f"Bearer {tok}" - ) - if raw: - headers["Accept"] = "application/vnd.github.raw+json" - return urllib.request.Request(url, headers=headers) - - -class _GitHubParseNamespace: - """ - Transform responses into intermediate representations. 
- - Where relevant: - - Adding cheap to compute metadata - - Dropping information that we don't need for the task - """ - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def url(self) -> GitHubUrl: - return self._gh.url - - def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: - core = rate_limit["core"] - reset = core["reset"] - return ParsedRateLimit( - **core, - reset_time=time.localtime(reset), - is_limited=core["remaining"] == 0, - is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, - ) - - def tag(self, tag: GitHubTag, /) -> ParsedTag: - sha = tag["commit"]["sha"] - return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") - - def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: - return [self.tag(t) for t in tags] - - def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: - """For a single tree (file) convert to an IR with only relevant properties.""" - path = Path(tree["path"]) - return ParsedTree( - file_name=path.name, - name_js=path.stem, - name_py=_js_to_py(path.stem), - suffix=path.suffix, - size=tree["size"], - url=tree["url"], - ext_supported=is_ext_supported(path.suffix), - tag=tag, - ) - - def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: - """For a tree response (directory of files) convert to an IR with only relevant properties.""" - return [self.tree(t, tag) for t in tree["tree"]] - - -class _GitHubQueryNamespace: - """**WIP** Interfacing with the cached metadata.""" - - def __init__(self, gh: _GitHub, /) -> None: - self._gh = gh - - @property - def paths(self) -> dict[_PathName, Path]: - return self._gh._paths - - def url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - fp = self.paths["trees"] - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - -class _GitHub: - """ - Primary interface with the GitHub API. - - Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. - - - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. - - Organizes distinct groups of operations into property accessor namespaces. - - - .. _tags: - https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags - .. _trees: - https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree - .. 
_rate_limit: - https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - - """ - - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) - - def __init__( - self, - output_dir: Path, - name_tags: str, - name_trees: str, - *, - base_url: LiteralString = "https://api.github.com/", - org: LiteralString = "vega", - package: LiteralString = "vega-datasets", - ) -> None: - output_dir.mkdir(exist_ok=True) - self._paths: dict[_PathName, Path] = { - "dir": output_dir, - "tags": output_dir / f"{name_tags}.parquet", - "trees": output_dir / f"{name_trees}.parquet", - } - repo = f"{base_url}repos/{org}/{package}/" - self._url = GitHubUrl( - BASE=base_url, - RATE=f"{base_url}rate_limit", - REPO=repo, - TAGS=f"{repo}tags", - TREES=f"{repo}git/trees/", - ) - - @property - def req(self) -> _GitHubRequestNamespace: - return _GitHubRequestNamespace(self) - - @property - def parse(self) -> _GitHubParseNamespace: - return _GitHubParseNamespace(self) - - @property - def query(self) -> _GitHubQueryNamespace: - return _GitHubQueryNamespace(self) - - @property - def url(self) -> GitHubUrl: - return self._url - - def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: - limit = self.parse.rate_limit(self.req.rate_limit()) - if strict and limit["is_limited"]: - raise NotImplementedError(limit) - return limit - - def tags( - self, n_head: int | None = None, *, warn_lower: bool = False - ) -> pl.DataFrame: - tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) - - def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: - """Retrieve directory info for a given version ``tag``.""" - trees = self.req.trees(tag) - tag_v = _tag_from(tag) if _is_str(tag) else tag["tag"] - parsed = self.parse.trees(trees, tag=tag_v) - df = ( - pl.DataFrame(parsed) - .lazy() - .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated()) - .with_columns( - url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), - pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), - pl.col("file_name"), - ) - ) - .collect() - ) - return df.select(*sorted(df.columns)) - - def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: - """ - Use known tags to discover and update missing trees metadata. - - Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
- """ - rate_limit = self.rate_limit(strict=True) - fp = self._paths["trees"] - trees = pl.read_parquet(fp) - missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" - ) - if missing_trees.is_empty(): - print(f"Already up-to-date {fp!s}") - return trees - else: - stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT - it = islice(missing_trees.iter_rows(named=True), stop) - missing = cast("Iterator[ReParsedTag]", it) - fresh_rows = self._trees_batched(missing) - print( - f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp!s}" - ) - return pl.concat((trees, fresh_rows)).pipe(semver.sort) - - def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: - limit = self.rate_limit(strict=True) - npm_tag_only = npm_tags.lazy().select("tag") - fp = self._paths["tags"] - if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: - return ( - pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() - ) - elif not fp.exists(): - print(f"Initializing {fp!s}") - tags = ( - self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) - print(f"Collected {tags.height} new tags") - return tags - else: - print("Checking for new tags") - prev = pl.scan_parquet(fp) - latest = ( - self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) - if latest.equals(prev.pipe(semver.sort).head(1).collect()): - print(f"Already up-to-date {fp!s}") - return prev.collect() - print(f"Refreshing {fp!s}") - prev_eager = prev.collect() - tags = ( - pl.concat((self.tags(), prev_eager), how="vertical") - .unique("sha") - .pipe(semver.sort) - ) - print(f"Collected {tags.height - prev_eager.height} new tags") - return tags - - def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: - rate_limit = self.rate_limit(strict=True) - if not isinstance(tags, Sequence): - tags = tuple(tags) - req = self.req - n = len(tags) - cost = req._TREES_COST * n - if rate_limit["remaining"] < cost: - raise NotImplementedError(rate_limit, cost) - delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY - print( - f"Collecting metadata for {n} missing releases.\n" - f"Using {delay_secs=} between requests ..." - ) - dfs: list[pl.DataFrame] = [] - for tag in tags: - time.sleep(delay_secs + random.triangular()) - dfs.append(self.trees(tag)) - return pl.concat(dfs) - - -####################################################################################### - - -class _Npm: - """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" - - def __init__( - self, - output_dir: Path, - name_tags: str, - *, - jsdelivr: Literal["jsdelivr"] = "jsdelivr", - npm: Literal["npm"] = "npm", - package: LiteralString = "vega-datasets", - jsdelivr_version: LiteralString = "v1", - ) -> None: - output_dir.mkdir(exist_ok=True) - self._paths: dict[Literal["tags"], Path] = { - "tags": output_dir / f"{name_tags}.parquet" - } - self._url: NpmUrl = NpmUrl( - CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", - TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", - ) - - @property - def url(self) -> NpmUrl: - return self._url - - def tags(self) -> pl.DataFrame: - """ - Request, parse tags from `Get package metadata`_. - - Notes - ----- - - Ignores canary releases - - ``npm`` can accept either, but this endpoint returns without "v": - - {tag} - v{tag} - - .. 
_Get package metadata: - https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- - """ - req = urllib.request.Request( - self.url.TAGS, headers={"Accept": "application/json"} - ) - with urllib.request.urlopen(req) as response: - content: NpmPackageMetadataResponse = json.load(response) - versions = [ - f"v{tag}" - for v in content["versions"] - if (tag := v["version"]) and semver.CANARY not in tag - ] - return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) +__all__ = ["app", "data"] class Application: @@ -526,17 +65,17 @@ def __init__( kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema - self._github: _GitHub = _GitHub( + self._github: GitHub = GitHub( output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) - self._npm: _Npm = _Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) @property - def github(self) -> _GitHub: + def github(self) -> GitHub: return self._github @property - def npm(self) -> _Npm: + def npm(self) -> Npm: return self._npm def refresh(self) -> pl.DataFrame: @@ -568,22 +107,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None app = Application(Path(__file__).parent / "_metadata", write_schema=True) -def _tag_from(s: str, /) -> str: - # - Actual tag - # - Trees url (using ref name) - # - npm url (works w/o the `v` prefix) - trees_url = app.github.url.TREES - if s.startswith("v"): - return s - elif s.startswith(trees_url): - return s.replace(trees_url, "") - elif s.startswith(_NPM_BASE_URL): - s, _ = s.replace(_NPM_BASE_URL, "").split("/") - return s if s.startswith("v") else f"v{s}" - else: - raise TypeError(s) - - # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. 
_OLD_SOURCE_TAG = "v1.29.0" # 5 years ago diff --git a/tools/datasets/github.py b/tools/datasets/github.py new file mode 100644 index 000000000..e245b91b1 --- /dev/null +++ b/tools/datasets/github.py @@ -0,0 +1,455 @@ +from __future__ import annotations + +import json +import os +import random +import time +import urllib.request +import warnings +from collections.abc import Iterable, Iterator, Sequence +from itertools import islice +from pathlib import Path +from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, cast + +import polars as pl + +from tools.datasets import semver +from tools.datasets.models import ( + GitHubRateLimitResources, + GitHubTag, + GitHubTree, + GitHubTreesResponse, + GitHubUrl, + ParsedRateLimit, + ParsedTag, + ParsedTree, +) + +if TYPE_CHECKING: + import sys + from collections.abc import MutableMapping + from email.message import Message + from urllib.request import OpenerDirector, Request + + from tools.datasets import ExtSupported + from tools.datasets.models import ReParsedTag + from tools.schemapi.utils import OneOrSeq + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + _PathName: TypeAlias = Literal["dir", "tags", "trees"] + +__all__ = ["GitHub"] + +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) +"""Query result scalar selection.""" + +# TODO: Work on where these should live/be accessed +_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" +_SUB_DIR = "data" + + +def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} + + +def _is_str(obj: Any) -> TypeIs[str]: + return isinstance(obj, str) + + +class _ErrorHandler(urllib.request.BaseHandler): + """ + Adds `rate limit`_ info to a forbidden error. + + .. _rate limit: + https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28 + """ + + def http_error_default( + self, req: Request, fp: IO[bytes] | None, code: int, msg: str, hdrs: Message + ): + if code == 403 and (reset := hdrs.get("X-RateLimit-Reset", None)): + limit = hdrs.get("X-RateLimit-Limit", "") + remaining = hdrs.get("X-RateLimit-Remaining", "") + msg = ( + f"{msg}\n\nFailed to balance rate limit.\n" + f"{limit=}, {remaining=}\n" + f"Reset: {time.localtime(int(reset))!r}" + ) + raise urllib.request.HTTPError(req.full_url, code, msg, hdrs, fp) + + +class _GitHubRequestNamespace: + """ + Fetching resources from the `GitHub API`_. + + .. 
_GitHub API: + https://docs.github.com/en/rest/about-the-rest-api/about-the-rest-api?apiVersion=2022-11-28 + """ + + _ENV_VAR: LiteralString = "VEGA_GITHUB_TOKEN" + _TAGS_MAX_PAGE: Literal[100] = 100 + _VERSION: LiteralString = "2022-11-28" + _UNAUTH_RATE_LIMIT: Literal[60] = 60 + _TAGS_COST: Literal[1] = 1 + _TREES_COST: Literal[2] = 2 + _UNAUTH_DELAY: Literal[5] = 5 + _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_TREES_LIMIT: Literal[10] = 10 + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self) -> GitHubRateLimitResources: + """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" + with self._gh._opener.open(self._request(self.url.RATE)) as response: + content: GitHubRateLimitResources = json.load(response)["resources"] + return content + + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: + """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" + if n < 1 or n > self._TAGS_MAX_PAGE: + raise ValueError(n) + req = self._request(f"{self.url.TAGS}?per_page={n}") + with self._gh._opener.open(req) as response: + content: list[GitHubTag] = json.load(response) + if warn_lower and len(content) < n: + earliest = response[-1]["name"] + n_response = len(content) + msg = f"Requested {n=} tags, but got {n_response}\n" f"{earliest=}" + warnings.warn(msg, stacklevel=3) + return content + + def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: + """ + For a given ``tag``, perform **2x requests** to get directory metadata. + + Returns response unchanged - but with annotations. + """ + if _is_str(tag): + url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" + else: + url = tag["trees_url"] + with self._gh._opener.open(self._request(url)) as response: + content: GitHubTreesResponse = json.load(response) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + if data_url := next(query, None): + with self._gh._opener.open(self._request(data_url)) as response: + data_dir: GitHubTreesResponse = json.load(response) + return data_dir + else: + raise FileNotFoundError + + def _request(self, url: str, /, *, raw: bool = False) -> Request: + """ + Wrap a request url with a `personal access token`_ - if set as an env var. + + By default the endpoint returns json, specify raw to get blob data. + See `Media types`_. + + .. _personal access token: + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + .. _Media types: + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + """ + headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} + if tok := os.environ.get(self._ENV_VAR): + headers["Authorization"] = ( + tok if tok.startswith("Bearer ") else f"Bearer {tok}" + ) + if raw: + headers["Accept"] = "application/vnd.github.raw+json" + return urllib.request.Request(url, headers=headers) + + +class _GitHubParseNamespace: + """ + Transform responses into intermediate representations. 
+ + Where relevant: + - Adding cheap to compute metadata + - Dropping information that we don't need for the task + """ + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def url(self) -> GitHubUrl: + return self._gh.url + + def rate_limit(self, rate_limit: GitHubRateLimitResources, /) -> ParsedRateLimit: + core = rate_limit["core"] + reset = core["reset"] + return ParsedRateLimit( + **core, + reset_time=time.localtime(reset), + is_limited=core["remaining"] == 0, + is_auth=core["limit"] > self._gh.req._UNAUTH_RATE_LIMIT, + ) + + def tag(self, tag: GitHubTag, /) -> ParsedTag: + sha = tag["commit"]["sha"] + return ParsedTag(tag=tag["name"], sha=sha, trees_url=f"{self.url.TREES}{sha}") + + def tags(self, tags: list[GitHubTag], /) -> list[ParsedTag]: + return [self.tag(t) for t in tags] + + def tree(self, tree: GitHubTree, tag: str, /) -> ParsedTree: + """For a single tree (file) convert to an IR with only relevant properties.""" + path = Path(tree["path"]) + return ParsedTree( + file_name=path.name, + name_js=path.stem, + name_py=path.stem.replace("-", "_"), + suffix=path.suffix, + size=tree["size"], + url=tree["url"], + ext_supported=is_ext_supported(path.suffix), + tag=tag, + ) + + def trees(self, tree: GitHubTreesResponse, /, tag: str) -> list[ParsedTree]: + """For a tree response (directory of files) convert to an IR with only relevant properties.""" + return [self.tree(t, tag) for t in tree["tree"]] + + def tag_from_str(self, s: str, /) -> str: + # - Actual tag + # - Trees url (using ref name) + # - npm url (works w/o the `v` prefix) + trees_url = self.url.TREES + if s.startswith("v"): + return s + elif s.startswith(trees_url): + return s.replace(trees_url, "") + elif s.startswith(_NPM_BASE_URL): + s, _ = s.replace(_NPM_BASE_URL, "").split("/") + return s if s.startswith("v") else f"v{s}" + else: + raise TypeError(s) + + +class _GitHubQueryNamespace: + """**WIP** Interfacing with the cached metadata.""" + + def __init__(self, gh: GitHub, /) -> None: + self._gh = gh + + @property + def paths(self) -> dict[_PathName, Path]: + return self._gh._paths + + def url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + """Querying multi-version trees metadata for `npm` url to fetch.""" + fp = self.paths["trees"] + if fp.suffix != ".parquet": + raise NotImplementedError(fp.suffix) + items = pl.scan_parquet(fp).filter(*predicates, **constraints).collect() + if items.is_empty(): + msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" + raise NotImplementedError(msg) + r = items.item(*item) + if _is_str(r): + return r + else: + msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." + raise TypeError(msg) + + +class GitHub: + """ + Primary interface with the GitHub API. + + Maintains up-to-date metadata, describing **every** available dataset across **all known** releases. + + - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. + - Organizes distinct groups of operations into property accessor namespaces. + + + .. _tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags + .. _trees: + https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree + .. 
_rate_limit: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + + """ + + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) + + def __init__( + self, + output_dir: Path, + name_tags: str, + name_trees: str, + *, + base_url: LiteralString = "https://api.github.com/", + org: LiteralString = "vega", + package: LiteralString = "vega-datasets", + ) -> None: + output_dir.mkdir(exist_ok=True) + self._paths: dict[_PathName, Path] = { + "dir": output_dir, + "tags": output_dir / f"{name_tags}.parquet", + "trees": output_dir / f"{name_trees}.parquet", + } + repo = f"{base_url}repos/{org}/{package}/" + self._url = GitHubUrl( + BASE=base_url, + RATE=f"{base_url}rate_limit", + REPO=repo, + TAGS=f"{repo}tags", + TREES=f"{repo}git/trees/", + ) + + @property + def req(self) -> _GitHubRequestNamespace: + return _GitHubRequestNamespace(self) + + @property + def parse(self) -> _GitHubParseNamespace: + return _GitHubParseNamespace(self) + + @property + def query(self) -> _GitHubQueryNamespace: + return _GitHubQueryNamespace(self) + + @property + def url(self) -> GitHubUrl: + return self._url + + def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: + limit = self.parse.rate_limit(self.req.rate_limit()) + if strict and limit["is_limited"]: + raise NotImplementedError(limit) + return limit + + def tags( + self, n_head: int | None = None, *, warn_lower: bool = False + ) -> pl.DataFrame: + tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) + return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + + def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: + """Retrieve directory info for a given version ``tag``.""" + trees = self.req.trees(tag) + tag_v = self.parse.tag_from_str(tag) if _is_str(tag) else tag["tag"] + parsed = self.parse.trees(trees, tag=tag_v) + df = ( + pl.DataFrame(parsed) + .lazy() + .rename({"url": "url_github"}) + .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns( + url_npm=pl.concat_str( + pl.lit(_NPM_BASE_URL), + pl.col("tag"), + pl.lit(f"/{_SUB_DIR}/"), + pl.col("file_name"), + ) + ) + .collect() + ) + return df.select(*sorted(df.columns)) + + def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: + """ + Use known tags to discover and update missing trees metadata. + + Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. 
+ """ + rate_limit = self.rate_limit(strict=True) + fp = self._paths["trees"] + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" + ) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp!s}") + return trees + else: + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT + it = islice(missing_trees.iter_rows(named=True), stop) + missing = cast("Iterator[ReParsedTag]", it) + fresh_rows = self._trees_batched(missing) + print( + f"Finished collection.\n" + f"Writing {fresh_rows.height} new rows to {fp!s}" + ) + return pl.concat((trees, fresh_rows)).pipe(semver.sort) + + def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: + limit = self.rate_limit(strict=True) + npm_tag_only = npm_tags.lazy().select("tag") + fp = self._paths["tags"] + if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: + return ( + pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() + ) + elif not fp.exists(): + print(f"Initializing {fp!s}") + tags = ( + self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + print(f"Collected {tags.height} new tags") + return tags + else: + print("Checking for new tags") + prev = pl.scan_parquet(fp) + latest = ( + self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() + ) + if latest.equals(prev.pipe(semver.sort).head(1).collect()): + print(f"Already up-to-date {fp!s}") + return prev.collect() + print(f"Refreshing {fp!s}") + prev_eager = prev.collect() + tags = ( + pl.concat((self.tags(), prev_eager), how="vertical") + .unique("sha") + .pipe(semver.sort) + ) + print(f"Collected {tags.height - prev_eager.height} new tags") + return tags + + def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: + rate_limit = self.rate_limit(strict=True) + if not isinstance(tags, Sequence): + tags = tuple(tags) + req = self.req + n = len(tags) + cost = req._TREES_COST * n + if rate_limit["remaining"] < cost: + raise NotImplementedError(rate_limit, cost) + delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY + print( + f"Collecting metadata for {n} missing releases.\n" + f"Using {delay_secs=} between requests ..." 
+ ) + dfs: list[pl.DataFrame] = [] + for tag in tags: + time.sleep(delay_secs + random.triangular()) + dfs.append(self.trees(tag)) + return pl.concat(dfs) diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py new file mode 100644 index 000000000..bdc20f83b --- /dev/null +++ b/tools/datasets/npm.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import json +import urllib.request +from typing import TYPE_CHECKING, Literal + +import polars as pl + +from tools.datasets import semver +from tools.datasets.models import NpmUrl + +if TYPE_CHECKING: + import sys + from pathlib import Path + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from tools.datasets.models import NpmPackageMetadataResponse + +__all__ = ["Npm"] + + +class Npm: + """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + + def __init__( + self, + output_dir: Path, + name_tags: str, + *, + jsdelivr: Literal["jsdelivr"] = "jsdelivr", + npm: Literal["npm"] = "npm", + package: LiteralString = "vega-datasets", + jsdelivr_version: LiteralString = "v1", + ) -> None: + output_dir.mkdir(exist_ok=True) + self._paths: dict[Literal["tags"], Path] = { + "tags": output_dir / f"{name_tags}.parquet" + } + self._url: NpmUrl = NpmUrl( + CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", + TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + ) + + @property + def url(self) -> NpmUrl: + return self._url + + def tags(self) -> pl.DataFrame: + """ + Request, parse tags from `Get package metadata`_. + + Notes + ----- + - Ignores canary releases + - ``npm`` can accept either, but this endpoint returns without "v": + + {tag} + v{tag} + + .. _Get package metadata: + https://www.jsdelivr.com/docs/data.jsdelivr.com#get-/v1/packages/npm/-package- + """ + req = urllib.request.Request( + self.url.TAGS, headers={"Accept": "application/json"} + ) + with urllib.request.urlopen(req) as response: + content: NpmPackageMetadataResponse = json.load(response) + versions = [ + f"v{tag}" + for v in content["versions"] + if (tag := v["version"]) and semver.CANARY not in tag + ] + return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) From b89e6dc31691cdeb2c33811c27db92c70ded7940 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:21:28 +0000 Subject: [PATCH 035/137] refactor: Move `DataLoader.__call__` -> `DataLoader.url()` -`data.name()` -> `data(name)` - `data.name.url` -> `data.url(name)` --- tools/datasets/__init__.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index bcbe725a1..c9114aa01 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -284,20 +284,13 @@ def __getattr__(self, name: str) -> Dataset: def __dir__(self) -> list[str]: return self.list_datasets() - def __call__( + def url( self, name: str, ext: ExtSupported | None = None, /, tag: LiteralString | Literal["latest"] | None = None, - ) -> WorkInProgress: - """ - **WIP** Will be using this *instead of* attribute access. 
- - - Original supports this as well - - Will only be using the actual (js_name) - - Some have hyphens, others underscores - """ + ) -> str: constraints: dict[Literal["tag", "suffix"], str] = {} if tag == "latest": raise NotImplementedError(tag) @@ -318,5 +311,21 @@ def __call__( q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] return app.github.query.url_from(**q) + def __call__( + self, + name: str, + ext: ExtSupported | None = None, + /, + tag: LiteralString | Literal["latest"] | None = None, + ) -> WorkInProgress: + """ + **WIP** Will be using this *instead of* attribute access. + + - Original supports this as well + - Will only be using the actual (js_name) + - Some have hyphens, others underscores + """ + return self.url(name, ext, tag=tag) + data = DataLoader() From 7b0fe294fabe3a562cf7d291951f1bd0da3e2b93 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 17:53:59 +0000 Subject: [PATCH 036/137] feat(typing): Generate annotations based on known datasets --- tools/datasets/__init__.py | 62 +++++++++++++++++ tools/datasets/_typing.py | 137 +++++++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 tools/datasets/_typing.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c9114aa01..bf5b7f187 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -16,9 +16,11 @@ import polars as pl +from tools.codemod import ruff from tools.datasets.github import GitHub from tools.datasets.models import QueryTree from tools.datasets.npm import Npm +from tools.schemapi import utils if TYPE_CHECKING: import sys @@ -37,10 +39,17 @@ else: from typing_extensions import TypeAlias + _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] + WorkInProgress: TypeAlias = Any __all__ = ["app", "data"] +HEADER_COMMENT = """\ +# The contents of this file are automatically written by +# tools/datasets.__init__.py. Do not modify directly. 
+""" + class Application: """ @@ -78,6 +87,14 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm + @property + def _aliases(self) -> dict[_PathAlias, Path]: + return { + "npm_tags": self.npm._paths["tags"], + "gh_tags": self.github._paths["tags"], + "gh_trees": self.github._paths["trees"], + } + def refresh(self) -> pl.DataFrame: npm_tags = self.npm.tags() self.write_parquet(npm_tags, self.npm._paths["tags"]) @@ -89,6 +106,21 @@ def refresh(self) -> pl.DataFrame: self.write_parquet(gh_trees, self.github._paths["trees"]) return gh_trees + def read(self, name: _PathAlias, /) -> pl.DataFrame: + """Read existing metadata from file.""" + return pl.read_parquet(self._from_alias(name)) + + def scan(self, name: _PathAlias, /) -> pl.LazyFrame: + """Scan existing metadata from file.""" + return pl.scan_parquet(self._from_alias(name)) + + def _from_alias(self, name: _PathAlias, /) -> Path: + if name not in {"npm_tags", "gh_tags", "gh_trees"}: + msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' + raise TypeError(msg) + else: + return self._aliases[name] + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" if not fp.exists(): @@ -118,6 +150,36 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None """ +def generate_datasets_typing(application: Application, output: Path, /) -> None: + app = application + tags = app.scan("gh_tags").select("tag").collect().to_series() + names = ( + app.scan("gh_trees") + .filter("ext_supported") + .unique("name_js") + .select("name_js") + .sort("name_js") + .collect() + .to_series() + ) + NAME = "DatasetName" + TAG = "VersionTag" + EXT = "Extension" + contents = ( + f"{HEADER_COMMENT}", + "from __future__ import annotations\n", + "import sys", + "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 10), "TypeAlias"), + "\n", + f"__all__ = {[NAME, TAG, EXT]}\n\n" + f"{NAME}: TypeAlias = {utils.spell_literal(names)}", + f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", + f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + ) + ruff.write_lint_format(output, contents) + + def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: return suffix in {".csv", ".json", ".tsv", ".arrow"} diff --git a/tools/datasets/_typing.py b/tools/datasets/_typing.py new file mode 100644 index 000000000..9414aaab4 --- /dev/null +++ b/tools/datasets/_typing.py @@ -0,0 +1,137 @@ +# The contents of this file are automatically written by +# tools/datasets.__init__.py. Do not modify directly. 
+ +from __future__ import annotations + +import sys +from typing import Literal + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + + +__all__ = ["DatasetName", "Extension", "VersionTag"] + +DatasetName: TypeAlias = Literal[ + "airports", + "annual-precip", + "anscombe", + "barley", + "birdstrikes", + "budget", + "budgets", + "burtin", + "cars", + "climate", + "co2-concentration", + "countries", + "crimea", + "disasters", + "driving", + "earthquakes", + "flare", + "flare-dependencies", + "flights-10k", + "flights-200k", + "flights-20k", + "flights-2k", + "flights-3m", + "flights-5k", + "flights-airport", + "football", + "gapminder", + "gapminder-health-income", + "github", + "global-temp", + "graticule", + "income", + "iowa-electricity", + "iris", + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "points", + "political-contributions", + "population", + "population_engineers_hurricanes", + "seattle-temps", + "seattle-weather", + "seattle-weather-hourly-normals", + "sf-temps", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + "unemployment", + "unemployment-across-industries", + "uniform-2d", + "us-10m", + "us-employment", + "us-state-capitals", + "volcano", + "weather", + "weball26", + "wheat", + "windvectors", + "world-110m", + "zipcodes", +] +VersionTag: TypeAlias = Literal[ + "v2.9.0", + "v2.8.1", + "v2.8.0", + "v2.7.0", + "v2.5.4", + "v2.5.3", + "v2.5.3-next.0", + "v2.5.2", + "v2.5.2-next.0", + "v2.5.1", + "v2.5.1-next.0", + "v2.5.0", + "v2.5.0-next.0", + "v2.4.0", + "v2.3.1", + "v2.3.0", + "v2.1.0", + "v2.0.0", + "v1.31.1", + "v1.31.0", + "v1.30.4", + "v1.30.3", + "v1.30.2", + "v1.30.1", + "v1.29.0", + "v1.24.0", + "v1.22.0", + "v1.21.1", + "v1.21.0", + "v1.20.0", + "v1.19.0", + "v1.18.0", + "v1.17.0", + "v1.16.0", + "v1.15.0", + "v1.14.0", + "v1.12.0", + "v1.11.0", + "v1.10.0", + "v1.8.0", + "v1.7.0", + "v1.5.0", +] +Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] From 572d069842ea80c085db22cf90aee7286e5a4bfd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 18:02:42 +0000 Subject: [PATCH 037/137] refactor(typing): Utilize `datasets._typing` --- tools/datasets/__init__.py | 28 ++++++++++++---------------- tools/datasets/github.py | 4 ++-- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index bf5b7f187..a92aeb2fc 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -11,7 +11,7 @@ import tempfile from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, get_args +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal from urllib.request import urlopen import polars as pl @@ -38,6 +38,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias + from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] @@ -144,11 +145,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" -ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] -""" -- 
`'flights-200k.(arrow|json)'` key collison using stem -""" - def generate_datasets_typing(application: Application, output: Path, /) -> None: app = application @@ -180,7 +176,7 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: +def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} @@ -193,7 +189,7 @@ def _js_to_py(s: str, /): class Dataset: - read_fn: ClassVar[dict[ExtSupported, Callable[..., pl.DataFrame]]] = { + read_fn: ClassVar[dict[Extension, Callable[..., pl.DataFrame]]] = { ".csv": pl.read_csv, ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), @@ -205,7 +201,7 @@ def __init__(self, name: str, /, base_url: str) -> None: file_name = DATASETS_JSON[_py_to_js(name)]["filename"] suffix = Path(file_name).suffix if is_ext_supported(suffix): - self.extension: ExtSupported = suffix + self.extension: Extension = suffix else: raise NotImplementedError(suffix, file_name) @@ -348,17 +344,17 @@ def __dir__(self) -> list[str]: def url( self, - name: str, - ext: ExtSupported | None = None, + name: DatasetName | LiteralString, + ext: Extension | None = None, /, - tag: LiteralString | Literal["latest"] | None = None, + tag: VersionTag | Literal["latest"] | None = None, ) -> str: constraints: dict[Literal["tag", "suffix"], str] = {} if tag == "latest": raise NotImplementedError(tag) elif tag is not None: constraints["tag"] = tag - if name.endswith(get_args(ExtSupported)): + if name.endswith((".csv", ".json", ".tsv", ".arrow")): name, suffix = name.rsplit(".", maxsplit=1) suffix = "." + suffix if not is_ext_supported(suffix): @@ -375,10 +371,10 @@ def url( def __call__( self, - name: str, - ext: ExtSupported | None = None, + name: DatasetName | LiteralString, + ext: Extension | None = None, /, - tag: LiteralString | Literal["latest"] | None = None, + tag: VersionTag | Literal["latest"] | None = None, ) -> WorkInProgress: """ **WIP** Will be using this *instead of* attribute access. 
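A sketch of how the generated aliases are intended to be used for static checking
(the module-level ``data`` loader is still WIP, so names and signatures here are
illustrative only):

    from tools.datasets import data
    from tools.datasets._typing import DatasetName, Extension, VersionTag

    name: DatasetName = "cars"     # a typo such as "carz" now fails type checking
    ext: Extension = ".json"
    tag: VersionTag = "v2.9.0"
    url = data.url(name, ext, tag=tag)  # e.g. the versioned jsdelivr url for the dataset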
diff --git a/tools/datasets/github.py b/tools/datasets/github.py index e245b91b1..fc0a899f2 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -31,7 +31,7 @@ from email.message import Message from urllib.request import OpenerDirector, Request - from tools.datasets import ExtSupported + from tools.datasets._typing import Extension from tools.datasets.models import ReParsedTag from tools.schemapi.utils import OneOrSeq @@ -62,7 +62,7 @@ _SUB_DIR = "data" -def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]: +def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} From 07dcc0baaf955d10c65b68c65165c86bc2cb9ddb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 20:16:07 +0000 Subject: [PATCH 038/137] feat: Adds `Npm.dataset` for remote reading] --- tools/datasets/__init__.py | 5 ++-- tools/datasets/npm.py | 55 +++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a92aeb2fc..b1a5b8550 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -375,7 +375,8 @@ def __call__( ext: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, - ) -> WorkInProgress: + **kwds: Any, + ) -> pl.DataFrame: """ **WIP** Will be using this *instead of* attribute access. @@ -383,7 +384,7 @@ def __call__( - Will only be using the actual (js_name) - Some have hyphens, others underscores """ - return self.url(name, ext, tag=tag) + return app.npm.dataset(self.url(name, ext, tag=tag), **kwds) data = DataLoader() diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index bdc20f83b..589db4660 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,7 +2,9 @@ import json import urllib.request -from typing import TYPE_CHECKING, Literal +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal import polars as pl @@ -11,20 +13,43 @@ if TYPE_CHECKING: import sys - from pathlib import Path + from urllib.request import OpenerDirector + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from tools.datasets._typing import Extension from tools.datasets.models import NpmPackageMetadataResponse + ReadFn: TypeAlias = Callable[..., pl.DataFrame] + __all__ = ["Npm"] +def is_ext_supported(suffix: str) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} + + class Npm: """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" + _read_fn: ClassVar[dict[Extension, ReadFn]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + def __init__( self, output_dir: Path, @@ -48,6 +73,30 @@ def __init__( def url(self) -> NpmUrl: return self._url + @classmethod + def reader_from(cls, url: str, /) -> ReadFn: + suffix = Path(url).suffix + if is_ext_supported(suffix): + return cls._read_fn[suffix] + else: + msg = f"Unexpected file extension {suffix!r}, from:\n{url}" + raise NotImplementedError(msg) + 
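    # For example (urls illustrative), dispatch is purely on the suffix:
    #   reader_from(".../vega-datasets@v2.9.0/data/cars.json") -> pl.read_json
    #   reader_from(".../data/unemployment.tsv")               -> partial(pl.read_csv, separator="\t")
    #   reader_from(".../data/7zip.png")                       -> raises NotImplementedError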
+ def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + """ + Fetch a remote dataset. + + Parameters + ---------- + url + Full path to a known dataset. + **kwds + Arguments passed to the underlying read function. + """ + fn = self.reader_from(url) + with self._opener.open(url) as f: + return fn(f.read(), **kwds) + def tags(self) -> pl.DataFrame: """ Request, parse tags from `Get package metadata`_. @@ -66,7 +115,7 @@ def tags(self) -> pl.DataFrame: req = urllib.request.Request( self.url.TAGS, headers={"Accept": "application/json"} ) - with urllib.request.urlopen(req) as response: + with self._opener.open(req) as response: content: NpmPackageMetadataResponse = json.load(response) versions = [ f"v{tag}" From d8f37918b130d7f89defcb6f1104268db1997420 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 6 Nov 2024 20:24:38 +0000 Subject: [PATCH 039/137] refactor: Remove dead code --- tools/datasets/__init__.py | 173 ++----------------------------------- 1 file changed, 6 insertions(+), 167 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b1a5b8550..ab1af8d4b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,11 +8,8 @@ from __future__ import annotations import json -import tempfile -from functools import cached_property, partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal -from urllib.request import urlopen +from typing import TYPE_CHECKING, Any, Literal import polars as pl @@ -180,167 +177,9 @@ def is_ext_supported(suffix: str) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} -def _py_to_js(s: str, /): - return s.replace("_", "-") - - -def _js_to_py(s: str, /): - return s.replace("-", "_") - - -class Dataset: - read_fn: ClassVar[dict[Extension, Callable[..., pl.DataFrame]]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - - def __init__(self, name: str, /, base_url: str) -> None: - self.name: str = name - file_name = DATASETS_JSON[_py_to_js(name)]["filename"] - suffix = Path(file_name).suffix - if is_ext_supported(suffix): - self.extension: Extension = suffix - else: - raise NotImplementedError(suffix, file_name) - - self.url: str = f"{base_url}{file_name}" - - def __call__(self, **kwds: Any) -> pl.DataFrame: - fn = self.read_fn[self.extension] - with tempfile.NamedTemporaryFile() as tmp, urlopen(self.url) as f: - tmp.write(f.read()) - content = fn(tmp, **kwds) - return content - - def __repr__(self) -> str: - return ( - f"{type(self).__name__}(\n " - f"name={self.name!r},\n " - f"url={self.url!r}\n" - ")" - ) - - -DATASETS_JSON = { - # "7zip": {"filename": "7zip.png", "format": "png"}, - "airports": {"filename": "airports.csv", "format": "csv"}, - "annual-precip": {"filename": "annual-precip.json", "format": "json"}, - "anscombe": {"filename": "anscombe.json", "format": "json"}, - "barley": {"filename": "barley.json", "format": "json"}, - "birdstrikes": {"filename": "birdstrikes.json", "format": "json"}, - "budget": {"filename": "budget.json", "format": "json"}, - "budgets": {"filename": "budgets.json", "format": "json"}, - "burtin": {"filename": "burtin.json", "format": "json"}, - "cars": {"filename": "cars.json", "format": "json"}, - "climate": {"filename": "climate.json", "format": "json"}, - "co2-concentration": {"filename": "co2-concentration.csv", "format": "csv"}, - "countries": 
{"filename": "countries.json", "format": "json"}, - "crimea": {"filename": "crimea.json", "format": "json"}, - "disasters": {"filename": "disasters.csv", "format": "csv"}, - "driving": {"filename": "driving.json", "format": "json"}, - "earthquakes": {"filename": "earthquakes.json", "format": "json"}, - # "ffox": {"filename": "ffox.png", "format": "png"}, - "flare": {"filename": "flare.json", "format": "json"}, - "flare-dependencies": {"filename": "flare-dependencies.json", "format": "json"}, - "flights-10k": {"filename": "flights-10k.json", "format": "json"}, - "flights-200k": {"filename": "flights-200k.json", "format": "json"}, - "flights-20k": {"filename": "flights-20k.json", "format": "json"}, - "flights-2k": {"filename": "flights-2k.json", "format": "json"}, - "flights-3m": {"filename": "flights-3m.csv", "format": "csv"}, - "flights-5k": {"filename": "flights-5k.json", "format": "json"}, - "flights-airport": {"filename": "flights-airport.csv", "format": "csv"}, - "gapminder": {"filename": "gapminder.json", "format": "json"}, - "gapminder-health-income": { - "filename": "gapminder-health-income.csv", - "format": "csv", - }, - # "gimp": {"filename": "gimp.png", "format": "png"}, - "github": {"filename": "github.csv", "format": "csv"}, - "graticule": {"filename": "graticule.json", "format": "json"}, - "income": {"filename": "income.json", "format": "json"}, - "iowa-electricity": {"filename": "iowa-electricity.csv", "format": "csv"}, - "iris": {"filename": "iris.json", "format": "json"}, - "jobs": {"filename": "jobs.json", "format": "json"}, - "la-riots": {"filename": "la-riots.csv", "format": "csv"}, - "londonBoroughs": {"filename": "londonBoroughs.json", "format": "json"}, - "londonCentroids": {"filename": "londonCentroids.json", "format": "json"}, - "londonTubeLines": {"filename": "londonTubeLines.json", "format": "json"}, - "lookup_groups": {"filename": "lookup_groups.csv", "format": "csv"}, - "lookup_people": {"filename": "lookup_people.csv", "format": "csv"}, - "miserables": {"filename": "miserables.json", "format": "json"}, - "monarchs": {"filename": "monarchs.json", "format": "json"}, - "movies": {"filename": "movies.json", "format": "json"}, - "normal-2d": {"filename": "normal-2d.json", "format": "json"}, - "obesity": {"filename": "obesity.json", "format": "json"}, - "ohlc": {"filename": "ohlc.json", "format": "json"}, - "points": {"filename": "points.json", "format": "json"}, - "population": {"filename": "population.json", "format": "json"}, - "population_engineers_hurricanes": { - "filename": "population_engineers_hurricanes.csv", - "format": "csv", - }, - "seattle-temps": {"filename": "seattle-temps.csv", "format": "csv"}, - "seattle-weather": {"filename": "seattle-weather.csv", "format": "csv"}, - "sf-temps": {"filename": "sf-temps.csv", "format": "csv"}, - "sp500": {"filename": "sp500.csv", "format": "csv"}, - "stocks": {"filename": "stocks.csv", "format": "csv"}, - "udistrict": {"filename": "udistrict.json", "format": "json"}, - "unemployment": {"filename": "unemployment.tsv", "format": "tsv"}, - "unemployment-across-industries": { - "filename": "unemployment-across-industries.json", - "format": "json", - }, - "uniform-2d": {"filename": "uniform-2d.json", "format": "json"}, - "us-10m": {"filename": "us-10m.json", "format": "json"}, - "us-employment": {"filename": "us-employment.csv", "format": "csv"}, - "us-state-capitals": {"filename": "us-state-capitals.json", "format": "json"}, - "volcano": {"filename": "volcano.json", "format": "json"}, - "weather": {"filename": 
"weather.json", "format": "json"}, - "weball26": {"filename": "weball26.json", "format": "json"}, - "wheat": {"filename": "wheat.json", "format": "json"}, - "windvectors": {"filename": "windvectors.csv", "format": "csv"}, - "world-110m": {"filename": "world-110m.json", "format": "json"}, - "zipcodes": {"filename": "zipcodes.csv", "format": "csv"}, -} -"""Inlined `datasets.json`_. - -- Excluding images - -.. _datasets.json: - https://github.com/altair-viz/vega_datasets/blob/136e850447b49031f04baa137ce5c37a6678bbb1/vega_datasets/datasets.json -""" - - class DataLoader: - source_tag: ClassVar[str] = "v2.9.0" - _base_url_fmt: str = "https://cdn.jsdelivr.net/npm/vega-datasets@{0}/data/" - - @property - def base_url(self) -> str: - return self._base_url_fmt.format(self.source_tag) - - @cached_property - def _dataset_names(self) -> list[str]: - return sorted(DATASETS_JSON) - - @cached_property - def _py_js_names(self) -> dict[str, str]: - return {_js_to_py(name): name for name in self._dataset_names} - - def list_datasets(self) -> list[str]: - return list(self._py_js_names) - - def __getattr__(self, name: str) -> Dataset: - if name in self._py_js_names: - return Dataset(self._py_js_names[name], self.base_url) - else: - msg = f"No dataset named {name!r}" - raise AttributeError(msg) - - def __dir__(self) -> list[str]: - return self.list_datasets() + def __init__(self, application: Application, /) -> None: + self._app: Application = application def url( self, @@ -367,7 +206,7 @@ def url( else: constraints["suffix"] = ext q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] - return app.github.query.url_from(**q) + return self._app.github.query.url_from(**q) def __call__( self, @@ -384,7 +223,7 @@ def __call__( - Will only be using the actual (js_name) - Some have hyphens, others underscores """ - return app.npm.dataset(self.url(name, ext, tag=tag), **kwds) + return self._app.npm.dataset(self.url(name, ext, tag=tag), **kwds) -data = DataLoader() +data = DataLoader(app) From 4642a238971edea66b4bd5f5e3636a287de2db96 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:26:34 +0000 Subject: [PATCH 040/137] refactor: Replace `name_js`, `name_py` with `dataset_name` Since we're just using strings, there is no need for 2 forms of the name. 
The legacy package needed this for `__getattr__` access with valid identifiers --- tools/datasets/__init__.py | 9 +++++---- tools/datasets/_metadata/metadata-schema.json | 3 +-- tools/datasets/_metadata/metadata.parquet | Bin 20768 -> 19087 bytes tools/datasets/github.py | 5 ++--- tools/datasets/models.py | 6 ++---- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ab1af8d4b..8217ab355 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -146,12 +146,13 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None def generate_datasets_typing(application: Application, output: Path, /) -> None: app = application tags = app.scan("gh_tags").select("tag").collect().to_series() + DATASET_NAME = "dataset_name" names = ( app.scan("gh_trees") .filter("ext_supported") - .unique("name_js") - .select("name_js") - .sort("name_js") + .unique(DATASET_NAME) + .select(DATASET_NAME) + .sort(DATASET_NAME) .collect() .to_series() ) @@ -205,7 +206,7 @@ def url( raise TypeError(ext) else: constraints["suffix"] = ext - q = QueryTree(name_js=name, **constraints) # type: ignore[typeddict-item] + q = QueryTree(dataset_name=name, **constraints) # type: ignore[typeddict-item] return self._app.github.query.url_from(**q) def __call__( diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json index 2b5b9d955..d3da3f86d 100644 --- a/tools/datasets/_metadata/metadata-schema.json +++ b/tools/datasets/_metadata/metadata-schema.json @@ -1,9 +1,8 @@ { + "dataset_name": "str", "ext_supported": "bool", "file_name": "str", "name_collision": "bool", - "name_js": "str", - "name_py": "str", "size": "int", "suffix": "str", "tag": "str", diff --git a/tools/datasets/_metadata/metadata.parquet b/tools/datasets/_metadata/metadata.parquet index 071e4bd6cf68fcc17952c5057858fa29399c9415..97f235546beb0c56abede1cb419eab4afb89dd9c 100644 GIT binary patch delta 1026 zcmY*YUr3Wt6u;lsHnmnR^8b=EMyZU$h`A zfmROA%D`J=t`dG@yEKL5`Eu|zSN?JNX#WC3(=VTHlHcJb>FApGa6Eq zGrwuvR9^%194BxfohY=+H@vN>CqIgW^atT1$IrB-A85QBKYEmGYenfjTEiCpi8tL@ zp4P}7OmOw2VfQ6+tw~N~%_U@^X)C$gWJ+&uzQ+we&x6ZK1OQNLQKlHC_^kiB3_jbO z^~X){Gne(J>)}*>)}OP(WqH=W>V_{?au9*rbX5lk0)UPfXXsn zpgmBnHK7c&X#pS}3nq^($Mw_VGo*z|bv00^)Qq~uT;DF04XGB+^H4x@e>hkJ#P@3L z*kq|LQ@q+S0MJi~u}B#^fjC)3{MH?NGaANF4xQ=^#zS2RLom@VsL*{0{7S(FAf}FxM>d@})5eaJ z8IprZL1FOs^~DEhnb#6Cb%uvVAb#>_$x?~>-zOBbK&pt0ON#Qxw;rJLh;hS%Bt8b@ZW$xoG?sM^yHryd}k6 zwyKYjJ@x~}ZcJA<4r5$}n=rP8%@~`V*hwwh+o^*at&HSih61F)(U@;xe#m1>c#PEp z0Vg@=z=f@Au~2WQB~s}Pla$@O5f4zn;)?hN7){^8Xu&FX%omHAFg}!EY7VntBQw}5 zS*MkF9XfrTgMt-OgbI@=HYS2nsXAK67_IS9jZ~DHcY0}#UXW@9V$DgPyY6(rIGFHz z+Is^=6S1~87O?x8WB!i8C{-Q_bR1ik+RkeFJdp}dReNR7V(t(2xS~EBUQVU3F#n`@5T3{Awh1N1REd`OEQlui%ZB&u? z6QkH&WGMci5F`ugW>uz9r*rWGigV@t3stg%oN6PLw) zmPN0Nt}o`=d!Fud7Isks7oQGgZJ3diKPswu`25L|`B#0^;&ab#JnB5#(z4SO`Rv0A zxuE^iEsUlMU5Uo{*4wM5xUO{!an}bgJrRCR$dPqqi~GJdtW`TRyIyP#@^*>0G)-%` zb!4rypk<+Dq;GJ?5$V2&lN{w)mCbiXYhh2)opg*?yvr`SK4cqdRvtTXN5VNjv;Cu? 
zZiY9x@y~&F+4O*}*5-&6GFKhghWXhW*Jh6T?KW~*mrmFJ@;m+O#-{Hhip1vr`&a)y zc)xa7Tm86Zl%FIz96CAS(6TFgr;jYNQidBF!a{X{t(`k>$Hq*sYwr%q&4;dA_r)nM zhIFnY`~z=g7Yo<*`S)bpY#2N|swt^Nr(fV*oH%ZB4g?u+Vo$O2eM&Kb=q+* z4dFm=BzmFJ*S{I|v>yv%kfhxWBlQEoCiv$hQp2mj;YrIsRs}Zp?-#F8v|Z1fEAb~ z4f}tiS(GV>E;qMkHvxMKqw2p2gWoJZo9f;~1ffRba%in~fqstFv^?Kz_ZGtLQYE&YHi_@-$NMa_G zWJ69Q1&Ki`B^mHiCq-pJOyYJRU5Eh{LJb8-EKHVBmT%66#w$&#gyp$<-7-^Z@mMyb z)8jLis>iZbA?It^R4`i ParsedTree: path = Path(tree["path"]) return ParsedTree( file_name=path.name, - name_js=path.stem, - name_py=path.stem.replace("-", "_"), + dataset_name=path.stem, suffix=path.suffix, size=tree["size"], url=tree["url"], @@ -361,7 +360,7 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: pl.DataFrame(parsed) .lazy() .rename({"url": "url_github"}) - .with_columns(name_collision=pl.col("name_py").is_duplicated()) + .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( pl.lit(_NPM_BASE_URL), diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 5a6598fed..0271d09de 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -112,8 +112,7 @@ class NpmPackageMetadataResponse(TypedDict): class ParsedTree(TypedDict): file_name: str - name_js: str - name_py: str + dataset_name: str suffix: str size: int url: str @@ -123,8 +122,7 @@ class ParsedTree(TypedDict): class QueryTree(TypedDict, total=False): file_name: str - name_js: Required[str] - name_py: str + dataset_name: Required[str] suffix: str size: int url: str From 65f87fc2e99b49b781844993a6e45489ed648a65 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:28:20 +0000 Subject: [PATCH 041/137] fix: Remove invalid `semver.sort` op I think this was added in error, since the schema of the file never had `semver` columns Only noticed the bug when doing a full rebuild --- tools/datasets/github.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 33d7289af..9b6671646 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -397,7 +397,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: f"Finished collection.\n" f"Writing {fresh_rows.height} new rows to {fp!s}" ) - return pl.concat((trees, fresh_rows)).pipe(semver.sort) + return pl.concat((trees, fresh_rows)) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) From 6349b0f255fab9df3173b5b75c660056317dfe82 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:08:04 +0000 Subject: [PATCH 042/137] fix: Add missing init path for `refresh_trees` --- tools/datasets/github.py | 81 ++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 9b6671646..cb9d74751 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -3,13 +3,14 @@ import json import os import random +import sys import time import urllib.request import warnings -from collections.abc import Iterable, Iterator, Sequence +from collections.abc import Iterable, Iterator, Mapping, Sequence from itertools import islice from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, cast +from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast import polars as pl @@ -23,16 +24,20 @@ 
ParsedRateLimit, ParsedTag, ParsedTree, + ReParsedTag, ) +if sys.version_info >= (3, 13): + from typing import is_typeddict +else: + from typing_extensions import is_typeddict + if TYPE_CHECKING: - import sys from collections.abc import MutableMapping from email.message import Message from urllib.request import OpenerDirector, Request from tools.datasets._typing import Extension - from tools.datasets.models import ReParsedTag from tools.schemapi.utils import OneOrSeq if sys.version_info >= (3, 13): @@ -50,8 +55,11 @@ _PathName: TypeAlias = Literal["dir", "tags", "trees"] + __all__ = ["GitHub"] +_TD = TypeVar("_TD", bound=Mapping[str, Any]) + _ItemSlice: TypeAlias = ( "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" ) @@ -379,25 +387,27 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. """ + if gh_tags.is_empty(): + msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" + raise NotImplementedError(msg) rate_limit = self.rate_limit(strict=True) + stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT fp = self._paths["trees"] - trees = pl.read_parquet(fp) - missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" - ) - if missing_trees.is_empty(): - print(f"Already up-to-date {fp!s}") - return trees + TP = ReParsedTag + if not fp.exists(): + print(f"Initializing {fp!s}") + return self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: - stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT - it = islice(missing_trees.iter_rows(named=True), stop) - missing = cast("Iterator[ReParsedTag]", it) - fresh_rows = self._trees_batched(missing) - print( - f"Finished collection.\n" - f"Writing {fresh_rows.height} new rows to {fp!s}" + trees = pl.read_parquet(fp) + missing_trees = gh_tags.join( + trees.select(pl.col("tag").unique()), on="tag", how="anti" ) - return pl.concat((trees, fresh_rows)) + if missing_trees.is_empty(): + print(f"Already up-to-date {fp!s}") + return trees + else: + fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) + return pl.concat((trees, fresh)) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) @@ -451,4 +461,35 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: for tag in tags: time.sleep(delay_secs + random.triangular()) dfs.append(self.trees(tag)) - return pl.concat(dfs) + df = pl.concat(dfs) + print(f"Finished collection.\n" f"Found {df.height} new rows") + return df + + +def _iter_rows(df: pl.DataFrame, stop: int | None, /, tp: type[_TD]) -> Iterator[_TD]: + """ + Wraps `pl.DataFrame.iter_rows`_ with typing to preserve key completions. + + Parameters + ---------- + df + Target dataframe. + stop + Passed to `itertools.islice`_. + tp + Static type representing a row/record. + + .. note:: + Performs a **very basic** runtime check on the type of ``tp`` (*not* ``df``). + + Primarily used to override ``dict[str, Any]`` when a *narrower* type is known. + + .. _itertools.islice: + https://docs.python.org/3/library/itertools.html#itertools.islice + .. 
_pl.DataFrame.iter_rows: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_rows.html + """ + if not TYPE_CHECKING: + assert is_typeddict(tp) or issubclass(tp, Mapping) + + return cast(Iterator[_TD], islice(df.iter_rows(named=True), stop)) From f1d610c528e81c12381114b2fafea13d53267bab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:41:54 +0000 Subject: [PATCH 043/137] refactor: Move public interface to `_io` Temporary home, see module docstring --- tools/datasets/__init__.py | 47 ++-------- tools/datasets/_io.py | 178 +++++++++++++++++++++++++++++++++++++ tools/datasets/github.py | 42 --------- tools/datasets/models.py | 10 --- tools/datasets/npm.py | 49 +--------- 5 files changed, 188 insertions(+), 138 deletions(-) create mode 100644 tools/datasets/_io.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 8217ab355..3adc2321b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -14,8 +14,8 @@ import polars as pl from tools.codemod import ruff +from tools.datasets._io import Reader from tools.datasets.github import GitHub -from tools.datasets.models import QueryTree from tools.datasets.npm import Npm from tools.schemapi import utils @@ -23,10 +23,6 @@ import sys from collections.abc import Mapping - if sys.version_info >= (3, 13): - from typing import TypeIs - else: - from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -174,13 +170,9 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -def is_ext_supported(suffix: str) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - class DataLoader: - def __init__(self, application: Application, /) -> None: - self._app: Application = application + def __init__(self, metadata: Path, /) -> None: + self._reader = Reader(metadata) def url( self, @@ -189,25 +181,8 @@ def url( /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - constraints: dict[Literal["tag", "suffix"], str] = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: - constraints["tag"] = tag - if name.endswith((".csv", ".json", ".tsv", ".arrow")): - name, suffix = name.rsplit(".", maxsplit=1) - suffix = "." + suffix - if not is_ext_supported(suffix): - raise TypeError(suffix) - else: - constraints["suffix"] = suffix - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) - else: - constraints["suffix"] = ext - q = QueryTree(dataset_name=name, **constraints) # type: ignore[typeddict-item] - return self._app.github.query.url_from(**q) + """Return the address of a remote dataset.""" + return self._reader.url(name, ext, tag=tag) def __call__( self, @@ -217,14 +192,8 @@ def __call__( tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, ) -> pl.DataFrame: - """ - **WIP** Will be using this *instead of* attribute access. 
- - - Original supports this as well - - Will only be using the actual (js_name) - - Some have hyphens, others underscores - """ - return self._app.npm.dataset(self.url(name, ext, tag=tag), **kwds) + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(self.url(name, ext, tag=tag), **kwds) -data = DataLoader(app) +data = DataLoader(app._from_alias("gh_trees")) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py new file mode 100644 index 000000000..4a6dce431 --- /dev/null +++ b/tools/datasets/_io.py @@ -0,0 +1,178 @@ +""" +Will be part of the public ``alt.datasets`` subpackage. + +- Interfacing with the cached metadata. + - But not updating it +- Performing requests from those urls +- Dispatching read function on file extension + +Note +---- +- Building with ``polars`` first, then will work backwards with ``narwhals``. + - Since ``narwhals`` is a subset of ``polars`` +""" + +from __future__ import annotations + +import urllib.request +from functools import partial +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar + +import polars as pl + +if TYPE_CHECKING: + import sys + from urllib.request import OpenerDirector + + from _typeshed import StrPath + + if sys.version_info >= (3, 13): + from typing import TypeIs + else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + from narwhals import typing as nw_typing # noqa: F401 + + from tools.datasets._typing import DatasetName, Extension, VersionTag + from tools.schemapi.utils import OneOrSeq + + _ExtensionScan: TypeAlias = Literal[".parquet"] + + ReadFn: TypeAlias = Callable[..., pl.DataFrame] + ScanFn: TypeAlias = Callable[..., pl.LazyFrame] + _T = TypeVar("_T") + +__all__ = ["Reader"] + +_ItemSlice: TypeAlias = ( + "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" +) +"""Query result scalar selection.""" + + +class Reader: + _read_fn: ClassVar[dict[Extension, ReadFn]] = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + + def __init__(self, fp_trees: Path, /) -> None: + self._fp_trees: Path = fp_trees + + @classmethod + def reader_from(cls, source: StrPath, /) -> ReadFn: + suffix = validate_suffix(source, is_ext_supported) + return cls._read_fn[suffix] + + @classmethod + def scanner_from(cls, source: StrPath, /) -> ScanFn: + suffix = validate_suffix(source, is_ext_scan) + return cls._scan_fn[suffix] + + def url( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + ) -> str: + constraints: dict[str, str] = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + # NOTE: Probably need to remove/move this + if name.endswith((".csv", ".json", ".tsv", ".arrow")): + name, suffix = name.rsplit(".", maxsplit=1) + suffix = "." 
+ suffix + if not is_ext_supported(suffix): + raise TypeError(suffix) + else: + constraints["suffix"] = suffix + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + return self._url_from(item=(0, "url_npm"), dataset_name=name, **constraints) + + def _url_from( + self, + *predicates: OneOrSeq[str | pl.Expr], + item: _ItemSlice = (0, "url_npm"), + **constraints: Any, + ) -> str: + r""" + Querying multi-version trees metadata for `npm` url to fetch. + + Parameters + ---------- + \*predicates, \*\*constraints + Passed directly to `pl.LazyFrame.filter`_. + item + Scalar selection args for `pl.DataFrame.item`_. + + .. _pl.LazyFrame.filter: + https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html + .. _pl.DataFrame.item: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html + """ + source = self._fp_trees + fn = self.scanner_from(self._fp_trees) + results = fn(source).filter(*predicates, **constraints).collect() + if not results.is_empty(): + url = results.item(*item) + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + else: + terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) + msg = f"Found no results for:\n{terms}" + raise NotImplementedError(msg) + + def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + """ + Fetch a remote dataset. + + Parameters + ---------- + url + Full path to a known dataset. + **kwds + Arguments passed to the underlying read function. + """ + fn = self.reader_from(url) + with self._opener.open(url) as f: + return fn(f.read(), **kwds) + + +def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: + suffix: Any = Path(source).suffix + if guard(suffix): + return suffix + else: + msg = f"Unexpected file extension {suffix!r}, from:\n{source}" + raise TypeError(msg) + + +def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: + return suffix == ".parquet" + + +def is_ext_supported(suffix: Any) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow"} diff --git a/tools/datasets/github.py b/tools/datasets/github.py index cb9d74751..951221765 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -38,7 +38,6 @@ from urllib.request import OpenerDirector, Request from tools.datasets._typing import Extension - from tools.schemapi.utils import OneOrSeq if sys.version_info >= (3, 13): from typing import TypeIs @@ -60,10 +59,6 @@ _TD = TypeVar("_TD", bound=Mapping[str, Any]) -_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -"""Query result scalar selection.""" # TODO: Work on where these should live/be accessed _NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" @@ -253,38 +248,6 @@ def tag_from_str(self, s: str, /) -> str: raise TypeError(s) -class _GitHubQueryNamespace: - """**WIP** Interfacing with the cached metadata.""" - - def __init__(self, gh: GitHub, /) -> None: - self._gh = gh - - @property - def paths(self) -> dict[_PathName, Path]: - return self._gh._paths - - def url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: - """Querying multi-version trees metadata for `npm` url to fetch.""" - fp = self.paths["trees"] - if fp.suffix != ".parquet": - raise NotImplementedError(fp.suffix) - items = pl.scan_parquet(fp).filter(*predicates, 
**constraints).collect() - if items.is_empty(): - msg = f"Found no results for:\n" f"{predicates!r}\n{constraints!r}" - raise NotImplementedError(msg) - r = items.item(*item) - if _is_str(r): - return r - else: - msg = f"Expected 'str' but got {type(r).__name__!r} from {r!r}." - raise TypeError(msg) - - class GitHub: """ Primary interface with the GitHub API. @@ -294,7 +257,6 @@ class GitHub: - Uses `tags`_, `trees`_, `rate_limit`_ endpoints. - Organizes distinct groups of operations into property accessor namespaces. - .. _tags: https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags .. _trees: @@ -339,10 +301,6 @@ def req(self) -> _GitHubRequestNamespace: def parse(self) -> _GitHubParseNamespace: return _GitHubParseNamespace(self) - @property - def query(self) -> _GitHubQueryNamespace: - return _GitHubQueryNamespace(self) - @property def url(self) -> GitHubUrl: return self._url diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 0271d09de..6ea7992ae 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -120,16 +120,6 @@ class ParsedTree(TypedDict): tag: str -class QueryTree(TypedDict, total=False): - file_name: str - dataset_name: Required[str] - suffix: str - size: int - url: str - ext_supported: bool - tag: str - - class ParsedTreesResponse(TypedDict): tag: str url: str diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index 589db4660..a5f068082 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,9 +2,7 @@ import json import urllib.request -from functools import partial -from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal +from typing import TYPE_CHECKING, ClassVar, Literal import polars as pl @@ -13,41 +11,22 @@ if TYPE_CHECKING: import sys + from pathlib import Path from urllib.request import OpenerDirector - if sys.version_info >= (3, 13): - from typing import TypeIs - else: - from typing_extensions import TypeIs if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - from tools.datasets._typing import Extension from tools.datasets.models import NpmPackageMetadataResponse - ReadFn: TypeAlias = Callable[..., pl.DataFrame] __all__ = ["Npm"] -def is_ext_supported(suffix: str) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - class Npm: """https://www.jsdelivr.com/docs/data.jsdelivr.com#overview.""" - _read_fn: ClassVar[dict[Extension, ReadFn]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def __init__( @@ -73,30 +52,6 @@ def __init__( def url(self) -> NpmUrl: return self._url - @classmethod - def reader_from(cls, url: str, /) -> ReadFn: - suffix = Path(url).suffix - if is_ext_supported(suffix): - return cls._read_fn[suffix] - else: - msg = f"Unexpected file extension {suffix!r}, from:\n{url}" - raise NotImplementedError(msg) - - def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: - """ - Fetch a remote dataset. - - Parameters - ---------- - url - Full path to a known dataset. - **kwds - Arguments passed to the underlying read function. 
- """ - fn = self.reader_from(url) - with self._opener.open(url) as f: - return fn(f.read(), **kwds) - def tags(self) -> pl.DataFrame: """ Request, parse tags from `Get package metadata`_. From c4ef112e0d21872807126c51a62cd144d535dccc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:43:16 +0000 Subject: [PATCH 044/137] refactor(perf): Don't recreate path mapping on every attribute access --- tools/datasets/__init__.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3adc2321b..47575278c 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -8,6 +8,7 @@ from __future__ import annotations import json +import types from pathlib import Path from typing import TYPE_CHECKING, Any, Literal @@ -72,6 +73,13 @@ def __init__( output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh ) self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._paths = types.MappingProxyType["_PathAlias", Path]( + { + "npm_tags": self.npm._paths["tags"], + "gh_tags": self.github._paths["tags"], + "gh_trees": self.github._paths["trees"], + } + ) @property def github(self) -> GitHub: @@ -81,23 +89,15 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - @property - def _aliases(self) -> dict[_PathAlias, Path]: - return { - "npm_tags": self.npm._paths["tags"], - "gh_tags": self.github._paths["tags"], - "gh_trees": self.github._paths["trees"], - } - def refresh(self) -> pl.DataFrame: npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self.npm._paths["tags"]) + self.write_parquet(npm_tags, self._paths["npm_tags"]) gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self.github._paths["tags"]) + self.write_parquet(gh_tags, self._paths["gh_tags"]) gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self.github._paths["trees"]) + self.write_parquet(gh_trees, self._paths["gh_trees"]) return gh_trees def read(self, name: _PathAlias, /) -> pl.DataFrame: @@ -113,7 +113,7 @@ def _from_alias(self, name: _PathAlias, /) -> Path: msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' raise TypeError(msg) else: - return self._aliases[name] + return self._paths[name] def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" From eb876ebc945776b2f7524ad6e7774347dd7d45ac Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:58:30 +0000 Subject: [PATCH 045/137] refactor: Split `Reader._url_from` into `url`, `_query` - Much more generic now in what it can be used for - For the caching, I'll need more columns than just `"url_npm"` - `"url_github" contains a hash --- tools/datasets/_io.py | 89 ++++++++++++++++++++++------------------ tools/datasets/models.py | 14 +++++++ 2 files changed, 62 insertions(+), 41 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 4a6dce431..812a9eeb0 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -28,14 +28,13 @@ from _typeshed import StrPath if sys.version_info >= (3, 13): - from typing import TypeIs + from typing import TypeIs, Unpack else: - from typing_extensions import TypeIs + from typing_extensions import TypeIs, Unpack if sys.version_info >= (3, 11): from typing import LiteralString else: from 
typing_extensions import LiteralString - if sys.version_info >= (3, 10): from typing import TypeAlias else: @@ -43,6 +42,7 @@ from narwhals import typing as nw_typing # noqa: F401 from tools.datasets._typing import DatasetName, Extension, VersionTag + from tools.datasets.models import Metadata from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] @@ -56,7 +56,12 @@ _ItemSlice: TypeAlias = ( "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" ) -"""Query result scalar selection.""" +""" +Scalar selection args for `pl.DataFrame.item`_. + +.. _pl.DataFrame.item: + https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html +""" class Reader: @@ -89,57 +94,34 @@ def url( /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - constraints: dict[str, str] = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: - constraints["tag"] = tag - # NOTE: Probably need to remove/move this - if name.endswith((".csv", ".json", ".tsv", ".arrow")): - name, suffix = name.rsplit(".", maxsplit=1) - suffix = "." + suffix - if not is_ext_supported(suffix): - raise TypeError(suffix) - else: - constraints["suffix"] = suffix - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) - else: - constraints["suffix"] = ext - return self._url_from(item=(0, "url_npm"), dataset_name=name, **constraints) - - def _url_from( - self, - *predicates: OneOrSeq[str | pl.Expr], - item: _ItemSlice = (0, "url_npm"), - **constraints: Any, - ) -> str: + df = self._query(**validate_constraints(name, ext, tag)) + item: _ItemSlice = (0, "url_npm") + url = df.item(*item) + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + + def _query( + self, *predicates: OneOrSeq[str | pl.Expr], **constraints: Unpack[Metadata] + ) -> pl.DataFrame: r""" - Querying multi-version trees metadata for `npm` url to fetch. + Query multi-version trees metadata. Parameters ---------- \*predicates, \*\*constraints Passed directly to `pl.LazyFrame.filter`_. - item - Scalar selection args for `pl.DataFrame.item`_. .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html - .. _pl.DataFrame.item: - https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html """ source = self._fp_trees fn = self.scanner_from(self._fp_trees) results = fn(source).filter(*predicates, **constraints).collect() if not results.is_empty(): - url = results.item(*item) - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." 
- raise TypeError(msg) + return results else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) msg = f"Found no results for:\n{terms}" @@ -161,6 +143,31 @@ def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: return fn(f.read(), **kwds) +def validate_constraints( + name: DatasetName | LiteralString, + ext: Extension | None, + tag: VersionTag | Literal["latest"] | None, + /, +) -> Metadata: + constraints: Metadata = {} + if tag == "latest": + raise NotImplementedError(tag) + elif tag is not None: + constraints["tag"] = tag + if name.endswith((".csv", ".json", ".tsv", ".arrow")): + fp = Path(name) + constraints["dataset_name"] = fp.stem + constraints["suffix"] = fp.suffix + return constraints + elif ext is not None: + if not is_ext_supported(ext): + raise TypeError(ext) + else: + constraints["suffix"] = ext + constraints["dataset_name"] = name + return constraints + + def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: suffix: Any = Path(source).suffix if guard(suffix): diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 6ea7992ae..fa0972035 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -126,6 +126,20 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] +class Metadata(TypedDict, total=False): + """Full schema for `metadata.parquet`.""" + + dataset_name: str + ext_supported: bool + file_name: str + name_collision: bool + size: int + suffix: str + tag: str + url_github: str + url_npm: str + + class GitHubRateLimit(TypedDict): limit: int used: int From 661a3851034c39c1c8249a7426ae33821f802f14 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 17:01:41 +0000 Subject: [PATCH 046/137] feat(DRAFT): Adds `GitHubUrl.BLOBS` - Common prefix to all rows in `metadata[url_github]` - Stripping this leaves only `sha` - For **2800** rows, there are only **109** unique hashes, so these can be used to reduce cache size --- tools/datasets/github.py | 1 + tools/datasets/models.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 951221765..4f15140e3 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -287,6 +287,7 @@ def __init__( repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( BASE=base_url, + BLOBS=f"{repo}git/blobs/", RATE=f"{base_url}rate_limit", REPO=repo, TAGS=f"{repo}tags", diff --git a/tools/datasets/models.py b/tools/datasets/models.py index fa0972035..2bca343aa 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -21,6 +21,7 @@ class GitHubUrl(NamedTuple): BASE: LiteralString + BLOBS: LiteralString RATE: LiteralString REPO: LiteralString TAGS: LiteralString From 22dcb17868246c0d79796e3e65c1419442c11c61 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 18:31:36 +0000 Subject: [PATCH 047/137] feat: Store `sha` instead of `github_url` Related 661a3851034c39c1c8249a7426ae33821f802f14 --- tools/datasets/_io.py | 13 +------------ tools/datasets/_metadata/metadata-schema.json | 2 +- tools/datasets/_metadata/metadata.parquet | Bin 19087 -> 18495 bytes tools/datasets/github.py | 3 +-- tools/datasets/models.py | 4 ++-- 5 files changed, 5 insertions(+), 17 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 812a9eeb0..e27bbcb7a 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -53,16 +53,6 @@ __all__ = ["Reader"] 
-_ItemSlice: TypeAlias = ( - "tuple[int | None, int | Literal['url_npm', 'url_github'] | None]" -) -""" -Scalar selection args for `pl.DataFrame.item`_. - -.. _pl.DataFrame.item: - https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.item.html -""" - class Reader: _read_fn: ClassVar[dict[Extension, ReadFn]] = { @@ -95,8 +85,7 @@ def url( tag: VersionTag | Literal["latest"] | None = None, ) -> str: df = self._query(**validate_constraints(name, ext, tag)) - item: _ItemSlice = (0, "url_npm") - url = df.item(*item) + url = df.item(0, "url_npm") if isinstance(url, str): return url else: diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json index d3da3f86d..53d9978b3 100644 --- a/tools/datasets/_metadata/metadata-schema.json +++ b/tools/datasets/_metadata/metadata-schema.json @@ -3,9 +3,9 @@ "ext_supported": "bool", "file_name": "str", "name_collision": "bool", + "sha": "str", "size": "int", "suffix": "str", "tag": "str", - "url_github": "str", "url_npm": "str" } \ No newline at end of file diff --git a/tools/datasets/_metadata/metadata.parquet b/tools/datasets/_metadata/metadata.parquet index 97f235546beb0c56abede1cb419eab4afb89dd9c..8bf0e17e3673d2b7cfbbe1ddba345f492d12e674 100644 GIT binary patch delta 3023 zcmaJ?c{G%L8=kT6>&zI8H)NNwO^qc@GnTPNwm~I>$&!6gCJZ8BGV<6$XsGBFvZX>V zs;`tnNq9pfiiqr6-!o3%Io~%Ol0x$i$7W-jz`7PPKfn7(ceuT-*y z4@2P)2t=((d&jF)+{F^IDLSMmQg@S6iH^Loe>S3vEbu{%LmhnbQ`4l{)@X~!komO+ZxXUC_y85Ttzsl0PGf?7{Z*n09tbLisw^AAVzU~i8w{3=+nXNO~7MP?N0--@WD z7eDpN(P{>akmu+^kOcW=Dp?!xND}?P<$!Zk`>r-mVcAh#?XK?<40V#ySdT}(r;=^! znxVl@eaVhH+P*FNjP8%*}84eo7#a&m;Gsop2 zBNYQ9$So$v9XrjZa=zj=eLmK{sJG-BlQ`q1Y^7Q}@RCT^2%u=n9(#Rhc(o!A>*PhM`8NwHxCnS@fo55ti8zv1SR2T)S)3NF+dKNQFezqU$ zPKa@HS9|9PvRF{%+4ZgpXqJm1LI}b3;r~XVZ$dtmpS&{hr3QA+$eyeabh9Oz7T^uNE6%fy1Y|T6Ou2_FF zFHCLZj*CL!yCGvFqW-G5sg?6LJ*!^ez{^l|2?$S^NwZWWJX5mF}-R*tSnmqgOb5^IjLJHw9a9 zY?5?q$VC?yxv~7Cz%2geAcl#-pIKoYQQ|GNVWD?(mDU)1Zab1Yvyu%OJp-Q8iP)m8 zr0o6&gbPo3b1$c3rvR@STgfKeH-jgs@8u4lVFIGNvFj@O0-SmQe+>V863;<v40DUge%<`JLeyBl^*`Kw}ug#Xn$a)Bgv5Ddt7_dQh!aW z*2kSLr>9?~E;^~Nth#U)8|!5mR~(iBFRqX!bp#%FWbp1-1f9P8E}x=1c>nm3nG5Nj zi(L!x)Z6s@1DXRyIeoZ$R$(1OOwq%#U*yBv6~bj7HYuW)yGapzaJlyV$CmBBk1pRJ z4$)cJisNBnmMRZoewGmY`W%unvSnaRi24nEqd!Sjw%GT*!^)zv#<44Yo7Al< zXl*nX1-+*MBkkwuC#;pqs(nH;le6M-TU6-rl(5O0^KBn-*B9gkq+~8dyBczxTK%EW zIknM!f60MqV?VW@N(sz<5O#_-$}xSdbo4PL5VLZ~y+sH8WV=d2XIbo?u?3@CT(C;i z-uab;en_Xp+Dor-7$rAz!I-4!(%~Z1cU!7JCQ)-Jy}zc*i}mYN*d{B6EL=433x@EV z6Rc{eAHSgyJs{@)92*puS^LPQvDSd;W3r!KR@_rj(j;`u@xpq>rx9H7$~wthi~s72 zTi0A-M9yJ!_a(2D!0z%7 zU{k6$0+$BA$7%S&2Udc`j8hfeux^F46I8BJ%A}NCHHj1HuL2gNQ+of(``A#Ra-zlade! 
zM@?gg8yJyKNO12|1poOECh$5hBK(CTIcz2s?H1x>d;gsT} z`jg>8h&fF}GZit8QRzQ)_z*{RPyvuz)PVc z{D`H!XzwPlXlkI7p2#lD+uDQFpGCoiqz^)P#H9B`{%KU260E(_7O)iDmCAM^6a@@R z$oyw`;)sWP`PFSG{0Ky4E)5FAo5`ty(_{~k3mthr63K%mM?{34gFq1dAXWy^xC4dj zK~(NAp}=2eGMfKq;2?Z%5B>#&8$eXb)<9)doZSEJAk@Mfdnt*{T(S#=S8zZO6E{%c z_Eq>09~Pi6_VkfLct=Njk`dL%pAt+)F7s%!4@L)=wJ=lg!TaIyx_EOu-WpH9;|W&y z<9NIo+qc7m#}p4Z5KH;Z*tRdxA_Q==BGBDIhd}csc?6T)z-oIMNN0dyL#)^r0gQn+ zn;4{V!h?+9+Y!$W{H6sZbl6RRTHpb*d!=XD9e@<OF5>2(N z*+7rrAgYI}VU#;bcTeu0=n!V?)`+ecUkgK;uN&cXnoIAE%Y$g4qm*GNf(lA|3HsKX_v`)e-MiLa`|NL@?|%E7ebzl)X&kAU9Cg*a4b~#2 zd!SD>KpsO(Lxb1^2lT2%L*^ke2QS9tn&eQA%{A!uYi0l7jrwc4%?Gnwba8{slMA1ggw>foKEb*Tu@a{v|pmyW^5^ zi|;!kGAy`fiG5^3O(sD#{$R0Jopp`RpRG~j{U@9Jb4a0K_&t;HWa7eshny02Wz16} z2_La-vZ$M@D}0x31j&UviDsWtA4Sd;JFoCf-F$h5r+8`RPO2RoP;$UFFd8cnWx+K> z-*2(RP1Tv%@p7@lR`cYTfx*d~qjUP&l!*^befIL^XSI2R*6NS*cg!vIK94O^Wo*ab zzjO#F7T5=*!Aox>+joixg=%_yuV(JD3pS~GQ~zvE&81({EODkqzh@E}QOH2coKrdB zUT4$VeN9(cN_)ubbBg6irPgD4gRiw~>bdYS`etRGo@8OAyNrn!)+B)%(VgJg$zN0T zwwZn&p@FxagmX{&r{)aZwuHBtpda-o*}iQ{ySCgy#-Ofcyzf{uct%~HRXZu29$jKn ze&rZnOT2RXeAc$)>GUs}5uN6YJiR7|V@ordU29TrzUTr9`Y=>So7wMS{5GisWqhTE$h3a>h`T zm9z77yxT~s6pUFAm+OS~Ii8W>XQSozJ#~w;>Um2+=x$3i#?kC4r~8-X%jT4^!5FaN zTC;AY6t5$p}6dgFf~_V&QZ+=Yk0zVx#=L<+{$*?hn)Uo~Gt2 z!lO2aUCO8OM_Qr{#~g2Rjg$_J_td2&C!x)YZLg3kCk}r6vbyBmN$!9DVsIS`NNOfM zvAS@{S7m|8jcy&Z8YtaZ?~;?l-{LKKvV9sU;*HPTSTOvkR^}q#7He@-{`SJAfK8-N z+_O~C&M2jJ@u2-(gvT`r->y9@k9{DyEZZEy=EP{G~G+gu?d~a zjEJbtr4-K9RD_Pdc5!F+4O(_aV4d}7LuwnJ8j|i&jAyg<9kOqR%3AKbZ^n21fL5)! zY|Pub^pm>zS3PQ^uSA4phmNdFm?iwTA8;l;a(n z9IaT`pU%W@rhQF!hzzU!!#iTn<`c``l$KQ8Eemc*1>U+!!yV&HY#z)M@wqgwnLfOz z-PqpW>cKd1zolCS`Pro>U;b7F>U?R=de$HH)m?l!InOg1#C-^;&jN8%oL!eI|sXl~^%3p)he%y}YvS93Oc zynJMnCNQ~t?&Ha@iU-%rmoMKYs1G=`D(U)0t~@%v zI6+X)4bM|>LZfI~_MGVOxKUAw`^hBJ^P$DGtZe34;jC6?*sgM1Rp7aa+muSK8d{uQ zNB48M(|}_}uqaG;b1~`}>M^(HX!TZkor|++LFOKQhx)1*eZx-bT)?@vr;5+_g<%sf zAF&=fWBY^xld06*=v$gUI2687Rh$*w@A~YKT&Fi@MkPZ&@mtU1<)!?wx#8PJ2l0Vn zpLQd$3)d;7sq^`iz_t^-9@?m{w0iBy^pkxC4%2f?&~cv|v)`wbROouNg)`+^Q~xwX>Fx8vph zutPPqxmUVM1@1b^x2?XqH|cagn=*9+Ng|>oNVnj5yjtpCnw>*htLy$8HrX7H4$D(< zePXIguP+DnnOsulV*xqTkuW2qo*=Qvy{vL9Hc#w=p;`Lu>{s-(RAPEAR$*>B@#gf$ zyS3+vV`fw64~%VG&hwEmIt!84i8=??dNuX-tI}az9+sJ1o7j=BM}t#sOPWgGga}XM zh~9g$MoLz!8R8N=-DW7R;9*Bn9D*k}x)HzVG-kG@P~vWcS$@l-MZQwI+u-;3;RQ(p z$-uNb1K|$Z{N+i@$w?_(GV_@>xW`LkB1&f3elD$6wUQsnV|=V%!BlhlaNX-A{|}aJ ztu&Eu+EbicC4D5F{;8ItBMy|y7xu{ehjO%kjvm)76Xk;%Q4g0+(kXGrbkdJ5buv24tSRB+k zbWEQ=4ezD*r-dCA%SocX@}(O4(~jV#;dKxjauUtzGIH%v(o zxwI2~!-^h)NN1^H!G}D0oXEI1cJ9_TC~lz1U$is0cM-}?X*>ol8i@S34ez&U-GQc9 zkRC+3)DDZ_!Kg!E0vI$bAn42)kI)|*oh5{f*(-h3g1yhXl>$Qp(J&>dzrSw?1cDsi zVJMA+A_x%acO)>}aQ}~MHIj*(*}-12FinW`o68*kq5ns1|KDEvNAH8^Bci|DR>H`O z@$98ciGm`^c0r_vHaM{0u&(HjPD|EAX0zb^$i_=hL@5-4tcww4oeXI>GJS<(AA8f$ zJbG&J5D;k$ z;C3*&s-ri>+e-}qi~`saj ParsedTree: dataset_name=path.stem, suffix=path.suffix, size=tree["size"], - url=tree["url"], + sha=tree["sha"], ext_supported=is_ext_supported(path.suffix), tag=tag, ) @@ -326,7 +326,6 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: df = ( pl.DataFrame(parsed) .lazy() - .rename({"url": "url_github"}) .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 2bca343aa..556aafa1a 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -116,7 +116,7 @@ class ParsedTree(TypedDict): dataset_name: str suffix: str size: int - url: str + sha: str ext_supported: bool tag: str @@ -134,10 +134,10 @@ class Metadata(TypedDict, 
total=False): ext_supported: bool file_name: str name_collision: bool + sha: str size: int suffix: str tag: str - url_github: str url_npm: str From 669df027cef9d857f2207c77279281a8a42a03d6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 19:07:54 +0000 Subject: [PATCH 048/137] feat(perf): Adds caching to `ALTAIR_DATASETS_DIR` --- tools/datasets/__init__.py | 2 +- tools/datasets/_io.py | 51 ++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 47575278c..de98cd281 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -193,7 +193,7 @@ def __call__( **kwds: Any, ) -> pl.DataFrame: """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(self.url(name, ext, tag=tag), **kwds) + return self._reader.dataset(name, ext, tag=tag, **kwds) data = DataLoader(app._from_alias("gh_trees")) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index e27bbcb7a..228bb9ce1 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -14,10 +14,11 @@ from __future__ import annotations +import os import urllib.request from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar, cast import polars as pl @@ -63,10 +64,25 @@ class Reader: } _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _ENV_VAR: LiteralString = "ALTAIR_DATASETS_DIR" def __init__(self, fp_trees: Path, /) -> None: self._fp_trees: Path = fp_trees + @property + def _datasets_dir(self) -> Path | None: # type: ignore[return] + """ + Returns path to datasets cache, if possible. + + Requires opt-in via environment variable:: + + Reader._ENV_VAR + """ + if _dir := os.environ.get(self._ENV_VAR): + datasets_dir = Path(_dir) + datasets_dir.mkdir(exist_ok=True) + return datasets_dir + @classmethod def reader_from(cls, source: StrPath, /) -> ReadFn: suffix = validate_suffix(source, is_ext_supported) @@ -116,20 +132,41 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) - def dataset(self, url: str, /, **kwds: Any) -> pl.DataFrame: + def dataset( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + **kwds: Any, + ) -> pl.DataFrame: """ - Fetch a remote dataset. + Fetch a remote dataset, attempt caching if possible. Parameters ---------- - url - Full path to a known dataset. + name, ext, tag + TODO **kwds Arguments passed to the underlying read function. 
""" + df = self._query(**validate_constraints(name, ext, tag)) + result = cast("Metadata", df.row(0, named=True)) + url = result["url_npm"] fn = self.reader_from(url) - with self._opener.open(url) as f: - return fn(f.read(), **kwds) + + if cache := self._datasets_dir: + fp = cache / (result["sha"] + result["suffix"]) + if fp.exists(): + return fn(fp, **kwds) + else: + fp.touch() + with self._opener.open(url) as f: + fp.write_bytes(f.read()) + return fn(fp, **kwds) + else: + with self._opener.open(url) as f: + return fn(f.read(), **kwds) def validate_constraints( From 20514100497595b52bd14e55dec0b139b4d1578a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 7 Nov 2024 23:01:06 +0000 Subject: [PATCH 049/137] feat(DRAFT): Adds initial generic backends --- tools/datasets/__init__.py | 4 +- tools/datasets/_io.py | 200 +++++++++++++++++++++++++++---------- 2 files changed, 151 insertions(+), 53 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index de98cd281..96932b9af 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -15,7 +15,7 @@ import polars as pl from tools.codemod import ruff -from tools.datasets._io import Reader +from tools.datasets._io import get_backend from tools.datasets.github import GitHub from tools.datasets.npm import Npm from tools.schemapi import utils @@ -172,7 +172,7 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: class DataLoader: def __init__(self, metadata: Path, /) -> None: - self._reader = Reader(metadata) + self._reader = get_backend("polars")(metadata) def url( self, diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 228bb9ce1..2074def12 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -17,10 +17,25 @@ import os import urllib.request from functools import partial +from itertools import chain, islice from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Literal, TypeVar, cast - +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Generic, + Literal, + Protocol, + TypeVar, + cast, + overload, +) + +import narwhals.stable.v1 as nw +import pandas as pd import polars as pl +from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: import sys @@ -40,34 +55,30 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias - from narwhals import typing as nw_typing # noqa: F401 from tools.datasets._typing import DatasetName, Extension, VersionTag from tools.datasets.models import Metadata from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] - - ReadFn: TypeAlias = Callable[..., pl.DataFrame] - ScanFn: TypeAlias = Callable[..., pl.LazyFrame] _T = TypeVar("_T") -__all__ = ["Reader"] +__all__ = ["get_backend"] -class Reader: - _read_fn: ClassVar[dict[Extension, ReadFn]] = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - _scan_fn: ClassVar[dict[_ExtensionScan, ScanFn]] = {".parquet": pl.scan_parquet} - _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - _ENV_VAR: LiteralString = "ALTAIR_DATASETS_DIR" - def __init__(self, fp_trees: Path, /) -> None: - self._fp_trees: Path = fp_trees +class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): + """ + Common functionality between backends. 
+ + Trying to use ``narwhals`` as much as possible + """ + + _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _metadata: Path @property def _datasets_dir(self) -> Path | None: # type: ignore[return] @@ -83,15 +94,13 @@ def _datasets_dir(self) -> Path | None: # type: ignore[return] datasets_dir.mkdir(exist_ok=True) return datasets_dir - @classmethod - def reader_from(cls, source: StrPath, /) -> ReadFn: + def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) - return cls._read_fn[suffix] + return self._read_fn[suffix] - @classmethod - def scanner_from(cls, source: StrPath, /) -> ScanFn: + def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) - return cls._scan_fn[suffix] + return self._scan_fn[suffix] def url( self, @@ -108,30 +117,6 @@ def url( msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." raise TypeError(msg) - def _query( - self, *predicates: OneOrSeq[str | pl.Expr], **constraints: Unpack[Metadata] - ) -> pl.DataFrame: - r""" - Query multi-version trees metadata. - - Parameters - ---------- - \*predicates, \*\*constraints - Passed directly to `pl.LazyFrame.filter`_. - - .. _pl.LazyFrame.filter: - https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html - """ - source = self._fp_trees - fn = self.scanner_from(self._fp_trees) - results = fn(source).filter(*predicates, **constraints).collect() - if not results.is_empty(): - return results - else: - terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n{terms}" - raise NotImplementedError(msg) - def dataset( self, name: DatasetName | LiteralString, @@ -139,7 +124,7 @@ def dataset( /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, - ) -> pl.DataFrame: + ) -> IntoDataFrameT: """ Fetch a remote dataset, attempt caching if possible. @@ -151,7 +136,8 @@ def dataset( Arguments passed to the underlying read function. """ df = self._query(**validate_constraints(name, ext, tag)) - result = cast("Metadata", df.row(0, named=True)) + it = islice(df.iter_rows(named=True), 1) + result = cast("Metadata", next(it)) url = result["url_npm"] fn = self.reader_from(url) @@ -168,6 +154,91 @@ def dataset( with self._opener.open(url) as f: return fn(f.read(), **kwds) + def _query( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.DataFrame[IntoDataFrameT]: + r""" + Query multi-version trees metadata. + + Parameters + ---------- + \*predicates, \*\*constraints + Passed directly to `pl.LazyFrame.filter`_. + + .. 
_pl.LazyFrame.filter: + https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html + """ + source = self._metadata + fn = self.scanner_from(source) + frame = nw.from_native(fn(source), pass_through=False) + result = frame.filter(_filter_reduce(predicates, constraints)) + df: nw.DataFrame[Any] = ( + result.collect() if isinstance(result, nw.LazyFrame) else result + ) + if not df.is_empty(): + return df + else: + terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) + msg = f"Found no results for:\n{terms}" + raise NotImplementedError(msg) + + +class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + _read_fn = { + ".csv": cast( + partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") + ), + ".json": cast( + partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") + ), + ".tsv": cast( + partial["pd.DataFrame"], + partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), + ), + ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + } + _scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + _read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + _scan_fn = {".parquet": pd.read_parquet} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _read_fn = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + _scan_fn = {".parquet": pl.scan_parquet} + + def __init__(self, metadata: Path, /) -> None: + self._metadata = metadata + + +def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: + """ + ``narwhals`` only accepts ``filter(*predicates)`. + + Manually converts the constraints into ``==`` + """ + return nw.all_horizontal( + chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + ) + def validate_constraints( name: DatasetName | LiteralString, @@ -209,3 +280,30 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: def is_ext_supported(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} + + +@overload +def get_backend(backend: Literal["polars"], /) -> type[_PolarsReader]: ... +@overload +def get_backend(backend: Literal["pandas"], /) -> type[_PandasReader]: ... +@overload +def get_backend( + backend: Literal["pandas[pyarrow]"], / +) -> type[_PandasPyArrowReader]: ... 
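
Sketch, not part of the diff: as of this commit ``get_backend`` returns a reader *class*, which the caller instantiates with the metadata path (see the ``DataLoader`` change earlier in this patch). Assuming the backend literals above, the ``ALTAIR_DATASETS_DIR`` opt-in from the previous commit, and that the optional dependencies are installed; the cache directory, metadata location and dataset name are illustrative.

```python
import os
from pathlib import Path

from tools.datasets._io import get_backend

os.environ["ALTAIR_DATASETS_DIR"] = "/tmp/altair-datasets"      # opt-in, sha-keyed cache
metadata = Path("tools/datasets/_metadata/metadata.parquet")    # illustrative location
reader = get_backend("pandas[pyarrow]")(metadata)               # class first, then an instance
cars = reader.dataset("cars", ".json")                          # cached/remote read -> pandas.DataFrame
url = reader.url("cars", ".json")                               # remote url only, no download
```
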
+def get_backend( + backend: Literal["polars", "pandas", "pandas[pyarrow]"], / +) -> type[_PolarsReader] | type[_PandasPyArrowReader] | type[_PandasReader]: + if backend == "polars": + return _PolarsReader + elif backend == "pandas[pyarrow]": + return _PandasPyArrowReader + elif backend == "pandas": + return _PandasReader + elif backend in {"pyarrow", "duckdb"}: + msg = "Included in ``dev``, not investigated yet" + raise NotImplementedError(msg) + elif backend in {"ibis", "cudf", "dask", "modin"}: + msg = "Supported by ``narwhals``, not investigated yet" + raise NotImplementedError(msg) + else: + raise TypeError(backend) From 0ea4e21348bcc7cf799cec11c72f19e06e1c8a49 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 10:35:11 +0000 Subject: [PATCH 050/137] feat: Generate and move `Metadata` (`TypedDict`) to `datasets._typing` --- tools/datasets/__init__.py | 47 +++++++++++++++++++++++++++++++- tools/datasets/_io.py | 3 +-- tools/datasets/_typing.py | 55 +++++++++++++++++++++++++++++++++++++- tools/datasets/models.py | 14 ---------- 4 files changed, 101 insertions(+), 18 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 96932b9af..b569e55d0 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -140,8 +140,12 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None def generate_datasets_typing(application: Application, output: Path, /) -> None: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + app = application tags = app.scan("gh_tags").select("tag").collect().to_series() + metadata_schema = app.scan("gh_trees").collect_schema().to_python() + DATASET_NAME = "dataset_name" names = ( app.scan("gh_trees") @@ -152,20 +156,61 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: .collect() .to_series() ) + indent = " " * 4 NAME = "DatasetName" TAG = "VersionTag" EXT = "Extension" + METADATA_TD = "Metadata" + DESCRIPTION_DEFAULT = "_description_" + NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" + + name_collision = ( + f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(ext=...)``." + ) + sha = ( + f"Unique hash for the dataset.{NOTE_SEP}" + f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" + f"then all ``tag``(s) in this range would **share** this value." 
+ ) + descriptions: dict[str, str] = { + "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "ext_supported": "Dataset can be read as tabular data.", + "file_name": "Equivalent to ``Pathlib.Path.name``.", + "name_collision": name_collision, + "sha": sha, + "size": "File size (*bytes*).", + "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", + "tag": "``vega-datasets`` release version.", + "url_npm": "Remote url used to access dataset.", + } + metadata_doc = f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + contents = ( f"{HEADER_COMMENT}", "from __future__ import annotations\n", "import sys", "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 14), "TypedDict"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + UNIVERSAL_TYPED_DICT.format( + name=METADATA_TD, + metaclass_kwds=", total=False", + td_args=f"\n{indent}".join( + f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() + ), + summary="Full schema for ``metadata.parquet``.", + doc=metadata_doc, + comment="", + ), ) ruff.write_lint_format(output, contents) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 2074def12..14159218d 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -56,8 +56,7 @@ else: from typing_extensions import TypeAlias - from tools.datasets._typing import DatasetName, Extension, VersionTag - from tools.datasets.models import Metadata + from tools.datasets._typing import DatasetName, Extension, Metadata, VersionTag from tools.schemapi.utils import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] diff --git a/tools/datasets/_typing.py b/tools/datasets/_typing.py index 9414aaab4..0a86bc6ba 100644 --- a/tools/datasets/_typing.py +++ b/tools/datasets/_typing.py @@ -6,13 +6,18 @@ import sys from typing import Literal +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -__all__ = ["DatasetName", "Extension", "VersionTag"] +__all__ = ["DatasetName", "Extension", "Metadata", "VersionTag"] DatasetName: TypeAlias = Literal[ "airports", @@ -135,3 +140,51 @@ "v1.5.0", ] Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] + + +class Metadata(TypedDict, total=False): + """ + Full schema for ``metadata.parquet``. + + Parameters + ---------- + dataset_name + Equivalent to ``Pathlib.Path.stem``. + ext_supported + Dataset can be read as tabular data. + file_name + Equivalent to ``Pathlib.Path.name``. + name_collision + Dataset is available via multiple ``suffix``(s). + + .. note:: + Requires specifying a preference in calls to ``data(ext=...)``. + sha + Unique hash for the dataset. + + .. note:: + If the dataset did *not* change between ``v1.0.0``-``v2.0.0``; + + then all ``tag``(s) in this range would **share** this value. + size + File size (*bytes*). + suffix + File extension. + + .. note:: + Equivalent to ``Pathlib.Path.suffix`` + tag + ``vega-datasets`` release version. + url_npm + Remote url used to access dataset. 
+ """ + + dataset_name: str + ext_supported: bool + file_name: str + name_collision: bool + sha: str + size: int + suffix: str + tag: str + url_npm: str diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 556aafa1a..044447707 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -127,20 +127,6 @@ class ParsedTreesResponse(TypedDict): tree: list[ParsedTree] -class Metadata(TypedDict, total=False): - """Full schema for `metadata.parquet`.""" - - dataset_name: str - ext_supported: bool - file_name: str - name_collision: bool - sha: str - size: int - suffix: str - tag: str - url_npm: str - - class GitHubRateLimit(TypedDict): limit: int used: int From a2e9baa5ddd825efedd26d3aa3a3dfe5630d4e07 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:30:55 +0000 Subject: [PATCH 051/137] feat: Adds optional backends, `polars[pyarrow]`, `with_backend` --- tools/datasets/__init__.py | 48 +++++++++++-- tools/datasets/_io.py | 137 +++++++++++++++++++++++-------------- 2 files changed, 127 insertions(+), 58 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b569e55d0..864829cf6 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,9 +10,10 @@ import json import types from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Generic, Literal, overload import polars as pl +from narwhals.typing import IntoDataFrameT, IntoFrameT from tools.codemod import ruff from tools.datasets._io import get_backend @@ -24,6 +25,8 @@ import sys from collections.abc import Mapping + import pandas as pd + if sys.version_info >= (3, 11): from typing import LiteralString else: @@ -32,6 +35,7 @@ from typing import TypeAlias else: from typing_extensions import TypeAlias + from tools.datasets._io import _Backend, _Reader from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] @@ -215,9 +219,8 @@ def generate_datasets_typing(application: Application, output: Path, /) -> None: ruff.write_lint_format(output, contents) -class DataLoader: - def __init__(self, metadata: Path, /) -> None: - self._reader = get_backend("polars")(metadata) +class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): + _reader: _Reader[IntoDataFrameT, IntoFrameT] def url( self, @@ -236,9 +239,40 @@ def __call__( /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, - ) -> pl.DataFrame: + ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" return self._reader.dataset(name, ext, tag=tag, **kwds) - -data = DataLoader(app._from_alias("gh_trees")) + @overload + @classmethod + def with_backend( + cls, backend: Literal["polars", "polars[pyarrow]"], / + ) -> DataLoader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pandas", "pandas[pyarrow]"], / + ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... + + @classmethod + def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: + """ + Initialize a new loader, using the specified backend. + + Parameters + ---------- + backend + DataFrame package/config used to return data. 
+ + * *polars*: _ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: _ + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + """ + obj = DataLoader.__new__(DataLoader) + obj._reader = get_backend(backend) + return obj + + +data = DataLoader.with_backend("polars") diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 14159218d..9bdb6e5e9 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -17,6 +17,8 @@ import os import urllib.request from functools import partial +from importlib import import_module +from importlib.util import find_spec from itertools import chain, islice from pathlib import Path from typing import ( @@ -33,14 +35,15 @@ ) import narwhals.stable.v1 as nw -import pandas as pd -import polars as pl from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: import sys from urllib.request import OpenerDirector + import pandas as pd + import polars as pl + import pyarrow as pa # noqa: F401 from _typeshed import StrPath if sys.version_info >= (3, 13): @@ -61,6 +64,9 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") + _Backend: TypeAlias = Literal[ + "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]" + ] __all__ = ["get_backend"] @@ -77,7 +83,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" - _metadata: Path + _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @property def _datasets_dir(self) -> Path | None: # type: ignore[return] @@ -181,51 +187,76 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + def _import(self, name: str, /) -> Any: + if spec := find_spec(name): + return import_module(spec.name) + else: + msg = f"{type(self).__name__!r} requires missing dependency {name!r}." + raise ModuleNotFoundError(msg, name=name) + + def __init__(self, *specs: str) -> None: ... 
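
Sketch, not part of the diff: the concrete readers that follow resolve their dependencies lazily through ``self._import``, so the public entry point added in this commit fails early, with a clear error, when a backend's package is missing. Names come from this patch; the tag value is illustrative.

```python
from tools.datasets import DataLoader

data = DataLoader.with_backend("polars[pyarrow]")   # dispatches to _PolarsPyArrowReader
cars = data("cars", ".json", tag="v2.9.0")          # -> polars.DataFrame
# With pyarrow absent, construction fails at with_backend() rather than at read time:
#   ModuleNotFoundError: '_PolarsPyArrowReader' requires missing dependency 'pyarrow'.
```
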
+ class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - _read_fn = { - ".csv": cast( - partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") - ), - ".json": cast( - partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") - ), - ".tsv": cast( - partial["pd.DataFrame"], - partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), - ), - ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), - } - _scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pd: str, _pa: str, /) -> None: + if not TYPE_CHECKING: + pd = self._import(_pd) + pa = self._import(_pa) # noqa: F841 + + self._read_fn = { + ".csv": cast( + partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") + ), + ".json": cast( + partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") + ), + ".tsv": cast( + partial["pd.DataFrame"], + partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), + ), + ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + } + self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - _read_fn = { - ".csv": pd.read_csv, - ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), - ".arrow": pd.read_feather, - } - _scan_fn = {".parquet": pd.read_parquet} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pd: str, /) -> None: + if not TYPE_CHECKING: + pd = self._import(_pd) + self._read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + self._scan_fn = {".parquet": pd.read_parquet} class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _read_fn = { - ".csv": pl.read_csv, - ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - } - _scan_fn = {".parquet": pl.scan_parquet} - - def __init__(self, metadata: Path, /) -> None: - self._metadata = metadata + def __init__(self, _pl: str, /) -> None: + if not TYPE_CHECKING: + pl = self._import(_pl) + self._read_fn = { + ".csv": pl.read_csv, + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t"), + ".arrow": pl.read_ipc, + } + self._scan_fn = {".parquet": pl.scan_parquet} + + +class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + def __init__(self, _pl: str, _pa: str, /) -> None: + if not TYPE_CHECKING: + pl = self._import(_pl) + pa = self._import(_pa) # noqa: F841 + self._read_fn = { + ".csv": partial(pl.read_csv, use_pyarrow=True), + ".json": pl.read_json, + ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), + ".arrow": partial(pl.read_ipc, use_pyarrow=True), + } + self._scan_fn = {".parquet": pl.scan_parquet} def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: @@ -281,23 +312,27 @@ def is_ext_supported(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} -@overload -def get_backend(backend: Literal["polars"], /) -> type[_PolarsReader]: ... -@overload -def get_backend(backend: Literal["pandas"], /) -> type[_PandasReader]: ... @overload def get_backend( - backend: Literal["pandas[pyarrow]"], / -) -> type[_PandasPyArrowReader]: ... 
+ backend: Literal["polars", "polars[pyarrow]"], / +) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... + + +@overload def get_backend( - backend: Literal["polars", "pandas", "pandas[pyarrow]"], / -) -> type[_PolarsReader] | type[_PandasPyArrowReader] | type[_PandasReader]: + backend: Literal["pandas", "pandas[pyarrow]"], / +) -> _Reader[pd.DataFrame, pd.DataFrame]: ... + + +def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": - return _PolarsReader + return _PolarsReader("polars") + elif backend == "polars[pyarrow]": + return _PolarsPyArrowReader("polars", "pyarrow") elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader + return _PandasPyArrowReader("pandas", "pyarrow") elif backend == "pandas": - return _PandasReader + return _PandasReader("pandas") elif backend in {"pyarrow", "duckdb"}: msg = "Included in ``dev``, not investigated yet" raise NotImplementedError(msg) From c8a1429064d20a1ed89e7723363c52779b5650cc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:19:10 +0000 Subject: [PATCH 052/137] feat: Adds `pyarrow` backend --- tools/datasets/__init__.py | 7 +++++ tools/datasets/_io.py | 59 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 864829cf6..3c1c8b13d 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -26,6 +26,7 @@ from collections.abc import Mapping import pandas as pd + import pyarrow as pa if sys.version_info >= (3, 11): from typing import LiteralString @@ -255,6 +256,12 @@ def with_backend( cls, backend: Literal["pandas", "pandas[pyarrow]"], / ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... + @overload + @classmethod + def with_backend( + cls, backend: Literal["pyarrow"], / + ) -> DataLoader[pa.Table, pa.Table]: ... + @classmethod def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: """ diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index 9bdb6e5e9..a75d0bd17 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -43,8 +43,12 @@ import pandas as pd import polars as pl - import pyarrow as pa # noqa: F401 + import pyarrow as pa from _typeshed import StrPath + from pyarrow.csv import read_csv as pa_read_csv # noqa: F401 + from pyarrow.feather import read_table as pa_read_feather # noqa: F401 + from pyarrow.json import read_json as pa_read_json # noqa: F401 + from pyarrow.parquet import read_table as pa_read_parquet # noqa: F401 if sys.version_info >= (3, 13): from typing import TypeIs, Unpack @@ -65,7 +69,7 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") _Backend: TypeAlias = Literal[ - "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]" + "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]", "pyarrow" ] @@ -259,6 +263,49 @@ def __init__(self, _pl: str, _pa: str, /) -> None: self._scan_fn = {".parquet": pl.scan_parquet} +class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): + """ + Reader backed by `pyarrow.Table`_. + + Warning + ------- + **JSON**: Only supports `line-delimited`_ JSON. + Likely to raise the following error: + + ArrowInvalid: JSON parse error: Column() changed from object to array in row 0 + + .. _pyarrow.Table: + https://arrow.apache.org/docs/python/generated/pyarrow.Table.html + .. 
_line-delimited: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + + def __init__(self, _pa: str, /) -> None: + if not TYPE_CHECKING: + pa = self._import(_pa) # noqa: F841 + pa_csv = self._import(f"{_pa}.csv") + pa_feather = self._import(f"{_pa}.feather") + pa_json = self._import(f"{_pa}.json") + pa_parquet = self._import(f"{_pa}.parquet") + + pa_read_csv = pa_csv.read_csv + pa_read_feather = pa_feather.read_table + pa_read_json = pa_json.read_json + pa_read_parquet = pa_parquet.read_table + + # opt1 = ParseOptions(delimiter="\t") # type: ignore + # Stubs suggest using a dataclass, but no way to construct it + opt2: Any = {"delimiter": "\t"} + + self._read_fn = { + ".csv": pa_read_csv, + ".json": pa_read_json, + ".tsv": partial(pa_read_csv, parse_options=opt2), + ".arrow": pa_read_feather, + } + self._scan_fn = {".parquet": pa_read_parquet} + + def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ ``narwhals`` only accepts ``filter(*predicates)`. @@ -324,6 +371,10 @@ def get_backend( ) -> _Reader[pd.DataFrame, pd.DataFrame]: ... +@overload +def get_backend(backend: Literal["pyarrow"], /) -> _Reader[pa.Table, pa.Table]: ... + + def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": return _PolarsReader("polars") @@ -333,7 +384,9 @@ def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: return _PandasPyArrowReader("pandas", "pyarrow") elif backend == "pandas": return _PandasReader("pandas") - elif backend in {"pyarrow", "duckdb"}: + elif backend == "pyarrow": + return _PyArrowReader("pyarrow") + elif backend == "duckdb": msg = "Included in ``dev``, not investigated yet" raise NotImplementedError(msg) elif backend in {"ibis", "cudf", "dask", "modin"}: From 279fea952007d83bd99e6cba1dfb79ca1a8ff70a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:19:52 +0000 Subject: [PATCH 053/137] docs: Update `.with_backend()` --- tools/datasets/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3c1c8b13d..6592d5d93 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -272,10 +272,21 @@ def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: backend DataFrame package/config used to return data. - * *polars*: _ + * *polars*: Using `polars defaults`_ * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: _ + * *pandas*: Using `pandas defaults`_. * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. 
_JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files """ obj = DataLoader.__new__(DataLoader) obj._reader = get_backend(backend) From 7d6c7ca2dce60c30b3c5e0107f9a496a17cb9695 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:17:40 +0000 Subject: [PATCH 054/137] chore: Remove `duckdb` comment Not planning to support this anymore, requires `fsspec` which isn't in `dev` ``` InvalidInputException Traceback (most recent call last) Cell In[6], line 5 3 with duck._reader._opener.open(url) as f: 4 fn = duck._reader._read_fn['.json'] ----> 5 thing = fn(f.read()) InvalidInputException: Invalid Input Error: This operation could not be completed because required module 'fsspec' is not installed" ``` --- tools/datasets/_io.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/datasets/_io.py b/tools/datasets/_io.py index a75d0bd17..7989ae282 100644 --- a/tools/datasets/_io.py +++ b/tools/datasets/_io.py @@ -386,9 +386,6 @@ def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: return _PandasReader("pandas") elif backend == "pyarrow": return _PyArrowReader("pyarrow") - elif backend == "duckdb": - msg = "Included in ``dev``, not investigated yet" - raise NotImplementedError(msg) elif backend in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) From 0bb4210b5aa5ff22c345946a8e73a432373529ff Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:21:09 +0000 Subject: [PATCH 055/137] ci(typing): Add `pyarrow-stubs` to `dev` dependencies Will put this in another PR, but need it here for IDE support --- pyproject.toml | 1 + tests/utils/test_utils.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ae15a8a4b..4132f0a25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ dev = [ "duckdb>=1.0", "ipython[kernel]", "pandas>=1.1.3", + "pyarrow-stubs", "pytest", "pytest-cov", "pytest-xdist[psutil]~=3.5", diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index c3b329cf0..36ed1b097 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -137,10 +137,11 @@ def test_sanitize_pyarrow_table_columns() -> None: ) # Create pyarrow table with explicit schema so that date32 type is preserved + # error: Argument 1 to "schema" has incompatible type "list[object]"; expected "Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType]" [arg-type] pa_table = pa.Table.from_pandas( df, pa.schema( - [ + ( pa.field("s", pa.string()), pa.field("f", pa.float64()), pa.field("i", pa.int64()), @@ -148,7 +149,7 @@ def test_sanitize_pyarrow_table_columns() -> None: pa.field("d", pa.date32()), pa.field("c", pa.dictionary(pa.int8(), pa.string())), pa.field("p", pa.timestamp("ns", tz="UTC")), - ] + ) ), ) sanitized = sanitize_narwhals_dataframe(nw.from_native(pa_table, eager_only=True)) From 89844253a51de27d4dac0590b013fb4f5361dd35 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 16:30:24 +0000 Subject: [PATCH 056/137] refactor: `generate_datasets_typing` -> `Application.generate_typing` --- tools/datasets/__init__.py | 150 ++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 76 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 
6592d5d93..645775fb4 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -134,6 +134,80 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None with fp_schema.open("w") as f: json.dump(schema, f, indent=2) + def generate_typing(self, output: Path, /) -> None: + from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT + + tags = self.scan("gh_tags").select("tag").collect().to_series() + metadata_schema = self.scan("gh_trees").collect_schema().to_python() + + DATASET_NAME = "dataset_name" + names = ( + self.scan("gh_trees") + .filter("ext_supported") + .unique(DATASET_NAME) + .select(DATASET_NAME) + .sort(DATASET_NAME) + .collect() + .to_series() + ) + indent = " " * 4 + NAME = "DatasetName" + TAG = "VersionTag" + EXT = "Extension" + METADATA_TD = "Metadata" + DESCRIPTION_DEFAULT = "_description_" + NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" + + name_collision = ( + f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(ext=...)``." + ) + sha = ( + f"Unique hash for the dataset.{NOTE_SEP}" + f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" + f"then all ``tag``(s) in this range would **share** this value." + ) + descriptions: dict[str, str] = { + "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "ext_supported": "Dataset can be read as tabular data.", + "file_name": "Equivalent to ``Pathlib.Path.name``.", + "name_collision": name_collision, + "sha": sha, + "size": "File size (*bytes*).", + "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", + "tag": "``vega-datasets`` release version.", + "url_npm": "Remote url used to access dataset.", + } + metadata_doc = f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + + contents = ( + f"{HEADER_COMMENT}", + "from __future__ import annotations\n", + "import sys", + "from typing import Literal, TYPE_CHECKING", + utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 10), "TypeAlias"), + "\n", + f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" + f"{NAME}: TypeAlias = {utils.spell_literal(names)}", + f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", + f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + UNIVERSAL_TYPED_DICT.format( + name=METADATA_TD, + metaclass_kwds=", total=False", + td_args=f"\n{indent}".join( + f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() + ), + summary="Full schema for ``metadata.parquet``.", + doc=metadata_doc, + comment="", + ), + ) + ruff.write_lint_format(output, contents) + app = Application(Path(__file__).parent / "_metadata", write_schema=True) @@ -144,82 +218,6 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None _CURRENT_SOURCE_TAG = "v2.9.0" -def generate_datasets_typing(application: Application, output: Path, /) -> None: - from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT - - app = application - tags = app.scan("gh_tags").select("tag").collect().to_series() - metadata_schema = app.scan("gh_trees").collect_schema().to_python() - - DATASET_NAME = "dataset_name" - names = ( - app.scan("gh_trees") - .filter("ext_supported") - .unique(DATASET_NAME) - .select(DATASET_NAME) - .sort(DATASET_NAME) - .collect() - .to_series() - ) - indent = " " * 4 - NAME = "DatasetName" - TAG = "VersionTag" - EXT = "Extension" - METADATA_TD = 
"Metadata" - DESCRIPTION_DEFAULT = "_description_" - NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" - - name_collision = ( - f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" - "Requires specifying a preference in calls to ``data(ext=...)``." - ) - sha = ( - f"Unique hash for the dataset.{NOTE_SEP}" - f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" - f"then all ``tag``(s) in this range would **share** this value." - ) - descriptions: dict[str, str] = { - "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", - "ext_supported": "Dataset can be read as tabular data.", - "file_name": "Equivalent to ``Pathlib.Path.name``.", - "name_collision": name_collision, - "sha": sha, - "size": "File size (*bytes*).", - "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", - "tag": "``vega-datasets`` release version.", - "url_npm": "Remote url used to access dataset.", - } - metadata_doc = f"\n{indent}".join( - f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" - for param in metadata_schema - ) - - contents = ( - f"{HEADER_COMMENT}", - "from __future__ import annotations\n", - "import sys", - "from typing import Literal, TYPE_CHECKING", - utils.import_typing_extensions((3, 14), "TypedDict"), - utils.import_typing_extensions((3, 10), "TypeAlias"), - "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" - f"{NAME}: TypeAlias = {utils.spell_literal(names)}", - f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", - f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', - UNIVERSAL_TYPED_DICT.format( - name=METADATA_TD, - metaclass_kwds=", total=False", - td_args=f"\n{indent}".join( - f"{param}: {tp.__name__}" for param, tp in metadata_schema.items() - ), - summary="Full schema for ``metadata.parquet``.", - doc=metadata_doc, - comment="", - ), - ) - ruff.write_lint_format(output, contents) - - class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): _reader: _Reader[IntoDataFrameT, IntoFrameT] From 9d062c8c8e030d4ea6b1288cf9e93692c60c78a0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:22:21 +0000 Subject: [PATCH 057/137] refactor: Split `datasets` into public/private packages - `tools.datasets`: Building & updating metadata file(s), generating annotations - `altair.datasets`: Consuming metadata, remote & cached dataset management --- altair/__init__.py | 3 +- altair/datasets/__init__.py | 117 ++++++++++++++++++ .../datasets/_metadata/metadata.parquet | Bin .../_io.py => altair/datasets/_readers.py | 11 +- {tools => altair}/datasets/_typing.py | 0 tools/datasets/__init__.py | 112 +++-------------- tools/datasets/_metadata/metadata-schema.json | 11 -- tools/datasets/github.py | 14 ++- 8 files changed, 146 insertions(+), 122 deletions(-) create mode 100644 altair/datasets/__init__.py rename {tools => altair}/datasets/_metadata/metadata.parquet (100%) rename tools/datasets/_io.py => altair/datasets/_readers.py (97%) rename {tools => altair}/datasets/_typing.py (100%) delete mode 100644 tools/datasets/_metadata/metadata-schema.json diff --git a/altair/__init__.py b/altair/__init__.py index d4e20f02f..d0d23dbaf 100644 --- a/altair/__init__.py +++ b/altair/__init__.py @@ -603,6 +603,7 @@ "core", "data", "data_transformers", + "datasets", "datum", "default_data_transformer", "display", @@ -653,7 +654,7 @@ def __dir__(): from altair.jupyter import JupyterChart from altair.expr import expr from altair.utils import 
AltairDeprecationWarning, parse_shorthand, Undefined -from altair import typing +from altair import typing, datasets def load_ipython_extension(ipython): diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py new file mode 100644 index 000000000..15c8069f9 --- /dev/null +++ b/altair/datasets/__init__.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, overload + +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._readers import _Reader, get_backend + +if TYPE_CHECKING: + import sys + from typing import Any, Literal + + import pandas as pd + import polars as pl + import pyarrow as pa + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Backend + from altair.datasets._typing import DatasetName, Extension, VersionTag + +__all__ = ["Loader", "data"] + + +class Loader(Generic[IntoDataFrameT, IntoFrameT]): + _reader: _Reader[IntoDataFrameT, IntoFrameT] + + def url( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + ) -> str: + """Return the address of a remote dataset.""" + return self._reader.url(name, ext, tag=tag) + + def __call__( + self, + name: DatasetName | LiteralString, + ext: Extension | None = None, + /, + tag: VersionTag | Literal["latest"] | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(name, ext, tag=tag, **kwds) + + def __repr__(self) -> str: + return f"{type(self).__name__}[{type(self._reader).__name__}]" + + @overload + @classmethod + def with_backend( + cls, backend: Literal["polars", "polars[pyarrow]"], / + ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pandas", "pandas[pyarrow]"], / + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + + @overload + @classmethod + def with_backend( + cls, backend: Literal["pyarrow"], / + ) -> Loader[pa.Table, pa.Table]: ... + + @classmethod + def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: + """ + Initialize a new loader, using the specified backend. + + Parameters + ---------- + backend + DataFrame package/config used to return data. + + * *polars*: Using `polars defaults`_ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: Using `pandas defaults`_. + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. 
_JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files + """ + obj = Loader.__new__(Loader) + obj._reader = get_backend(backend) + return obj + + +def __getattr__(name): + if name == "data": + global data + data = Loader.with_backend("pandas") + from altair.utils.deprecation import deprecated_warn + + deprecated_warn( + "Added only for backwards compatibility with `altair-viz/vega_datasets`.", + version="5.5.0", + alternative="altair.datasets.Loader.with_backend(...)", + stacklevel=3, + ) + return data + else: + raise AttributeError(name) diff --git a/tools/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet similarity index 100% rename from tools/datasets/_metadata/metadata.parquet rename to altair/datasets/_metadata/metadata.parquet diff --git a/tools/datasets/_io.py b/altair/datasets/_readers.py similarity index 97% rename from tools/datasets/_io.py rename to altair/datasets/_readers.py index 7989ae282..cbb02cd00 100644 --- a/tools/datasets/_io.py +++ b/altair/datasets/_readers.py @@ -1,15 +1,10 @@ """ -Will be part of the public ``alt.datasets`` subpackage. +Backends for ``alt.datasets.Loader``. - Interfacing with the cached metadata. - But not updating it - Performing requests from those urls - Dispatching read function on file extension - -Note ----- -- Building with ``polars`` first, then will work backwards with ``narwhals``. - - Since ``narwhals`` is a subset of ``polars`` """ from __future__ import annotations @@ -63,8 +58,8 @@ else: from typing_extensions import TypeAlias - from tools.datasets._typing import DatasetName, Extension, Metadata, VersionTag - from tools.schemapi.utils import OneOrSeq + from altair.datasets._typing import DatasetName, Extension, Metadata, VersionTag + from altair.vegalite.v5.schema._typing import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") diff --git a/tools/datasets/_typing.py b/altair/datasets/_typing.py similarity index 100% rename from tools/datasets/_typing.py rename to altair/datasets/_typing.py diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 645775fb4..d9b00d9a5 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -10,13 +10,11 @@ import json import types from pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Literal, overload +from typing import TYPE_CHECKING, Any, Literal import polars as pl -from narwhals.typing import IntoDataFrameT, IntoFrameT from tools.codemod import ruff -from tools.datasets._io import get_backend from tools.datasets.github import GitHub from tools.datasets.npm import Npm from tools.schemapi import utils @@ -25,25 +23,14 @@ import sys from collections.abc import Mapping - import pandas as pd - import pyarrow as pa - - if sys.version_info >= (3, 11): - from typing import LiteralString - else: - from typing_extensions import LiteralString if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias - from tools.datasets._io import _Backend, _Reader - from tools.datasets._typing import DatasetName, Extension, VersionTag _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] - WorkInProgress: TypeAlias = Any - -__all__ = ["app", "data"] +__all__ = ["app"] HEADER_COMMENT = """\ # The contents of this file are automatically written by @@ -61,7 +48,8 @@ class Application: def __init__( self, - output_dir: Path, + out_dir_tools: Path, + out_dir_altair: Path, *, write_schema: bool, trees_gh: str = 
"metadata", @@ -70,14 +58,18 @@ def __init__( kwds_gh: Mapping[str, Any] | None = None, kwds_npm: Mapping[str, Any] | None = None, ) -> None: - output_dir.mkdir(exist_ok=True) + out_dir_tools.mkdir(exist_ok=True) kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema self._github: GitHub = GitHub( - output_dir, name_tags=tags_gh, name_trees=trees_gh, **kwds_gh + out_dir_tools, + out_dir_altair, + name_tags=tags_gh, + name_trees=trees_gh, + **kwds_gh, ) - self._npm: Npm = Npm(output_dir, name_tags=tags_npm, **kwds_npm) + self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], @@ -209,86 +201,14 @@ def generate_typing(self, output: Path, /) -> None: ruff.write_lint_format(output, contents) -app = Application(Path(__file__).parent / "_metadata", write_schema=True) +app = Application( + Path(__file__).parent / "_metadata", + Path(__file__).parent.parent.parent / "altair" / "datasets" / "_metadata", + write_schema=False, +) # This is the tag in http://github.com/vega/vega-datasets from # which the datasets in this repository are sourced. _OLD_SOURCE_TAG = "v1.29.0" # 5 years ago _CURRENT_SOURCE_TAG = "v2.9.0" - - -class DataLoader(Generic[IntoDataFrameT, IntoFrameT]): - _reader: _Reader[IntoDataFrameT, IntoFrameT] - - def url( - self, - name: DatasetName | LiteralString, - ext: Extension | None = None, - /, - tag: VersionTag | Literal["latest"] | None = None, - ) -> str: - """Return the address of a remote dataset.""" - return self._reader.url(name, ext, tag=tag) - - def __call__( - self, - name: DatasetName | LiteralString, - ext: Extension | None = None, - /, - tag: VersionTag | Literal["latest"] | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, ext, tag=tag, **kwds) - - @overload - @classmethod - def with_backend( - cls, backend: Literal["polars", "polars[pyarrow]"], / - ) -> DataLoader[pl.DataFrame, pl.LazyFrame]: ... - - @overload - @classmethod - def with_backend( - cls, backend: Literal["pandas", "pandas[pyarrow]"], / - ) -> DataLoader[pd.DataFrame, pd.DataFrame]: ... - - @overload - @classmethod - def with_backend( - cls, backend: Literal["pyarrow"], / - ) -> DataLoader[pa.Table, pa.Table]: ... - - @classmethod - def with_backend(cls, backend: _Backend, /) -> DataLoader[Any, Any]: - """ - Initialize a new loader, using the specified backend. - - Parameters - ---------- - backend - DataFrame package/config used to return data. - - * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: Using `pandas defaults`_. - * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` - * *pyarrow*: (*Experimental*) - - .. warning:: - Most datasets use a `JSON format not supported`_ by ``pyarrow`` - - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - """ - obj = DataLoader.__new__(DataLoader) - obj._reader = get_backend(backend) - return obj - - -data = DataLoader.with_backend("polars") diff --git a/tools/datasets/_metadata/metadata-schema.json b/tools/datasets/_metadata/metadata-schema.json deleted file mode 100644 index 53d9978b3..000000000 --- a/tools/datasets/_metadata/metadata-schema.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "dataset_name": "str", - "ext_supported": "bool", - "file_name": "str", - "name_collision": "bool", - "sha": "str", - "size": "int", - "suffix": "str", - "tag": "str", - "url_npm": "str" -} \ No newline at end of file diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 0238aab69..8b58e8690 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -37,7 +37,7 @@ from email.message import Message from urllib.request import OpenerDirector, Request - from tools.datasets._typing import Extension + from altair.datasets._typing import Extension if sys.version_info >= (3, 13): from typing import TypeIs @@ -270,7 +270,8 @@ class GitHub: def __init__( self, - output_dir: Path, + out_dir_tools: Path, + out_dir_altair: Path, name_tags: str, name_trees: str, *, @@ -278,11 +279,12 @@ def __init__( org: LiteralString = "vega", package: LiteralString = "vega-datasets", ) -> None: - output_dir.mkdir(exist_ok=True) + out_dir_tools.mkdir(exist_ok=True) + out_dir_altair.mkdir(exist_ok=True) self._paths: dict[_PathName, Path] = { - "dir": output_dir, - "tags": output_dir / f"{name_tags}.parquet", - "trees": output_dir / f"{name_trees}.parquet", + "dir": out_dir_tools, + "tags": out_dir_tools / f"{name_tags}.parquet", + "trees": out_dir_altair / f"{name_trees}.parquet", } repo = f"{base_url}repos/{org}/{package}/" self._url = GitHubUrl( From a17d674303558f0989b2aaac835efa3d04de80cc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:44:57 +0000 Subject: [PATCH 058/137] refactor: Provide `npm` url to `GitHub(...)` --- tools/datasets/__init__.py | 3 ++- tools/datasets/github.py | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index d9b00d9a5..6319bd65e 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -62,14 +62,15 @@ def __init__( kwds_gh = kwds_gh or {} kwds_npm = kwds_npm or {} self._write_schema: bool = write_schema + self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._github: GitHub = GitHub( out_dir_tools, out_dir_altair, name_tags=tags_gh, name_trees=trees_gh, + npm_cdn_url=self._npm.url.CDN, **kwds_gh, ) - self._npm: Npm = Npm(out_dir_tools, name_tags=tags_npm, **kwds_npm) self._paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 8b58e8690..c2d7141aa 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -59,10 +59,7 @@ _TD = TypeVar("_TD", bound=Mapping[str, Any]) - -# TODO: Work on where these should live/be accessed -_NPM_BASE_URL = "https://cdn.jsdelivr.net/npm/vega-datasets@" -_SUB_DIR = "data" +_DATA = "data" def is_ext_supported(suffix: str) -> TypeIs[Extension]: @@ -152,7 +149,7 @@ def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: url = tag["trees_url"] with self._gh._opener.open(self._request(url)) as response: content: GitHubTreesResponse = 
json.load(response) - query = (tree["url"] for tree in content["tree"] if tree["path"] == _SUB_DIR) + query = (tree["url"] for tree in content["tree"] if tree["path"] == _DATA) if data_url := next(query, None): with self._gh._opener.open(self._request(data_url)) as response: data_dir: GitHubTreesResponse = json.load(response) @@ -237,12 +234,13 @@ def tag_from_str(self, s: str, /) -> str: # - Trees url (using ref name) # - npm url (works w/o the `v` prefix) trees_url = self.url.TREES + npm_url = self._gh._npm_cdn_url if s.startswith("v"): return s elif s.startswith(trees_url): return s.replace(trees_url, "") - elif s.startswith(_NPM_BASE_URL): - s, _ = s.replace(_NPM_BASE_URL, "").split("/") + elif s.startswith(npm_url): + s, _ = s.replace(npm_url, "").split("/") return s if s.startswith("v") else f"v{s}" else: raise TypeError(s) @@ -275,6 +273,7 @@ def __init__( name_tags: str, name_trees: str, *, + npm_cdn_url: LiteralString, base_url: LiteralString = "https://api.github.com/", org: LiteralString = "vega", package: LiteralString = "vega-datasets", @@ -295,6 +294,7 @@ def __init__( TAGS=f"{repo}tags", TREES=f"{repo}git/trees/", ) + self._npm_cdn_url: LiteralString = npm_cdn_url @property def req(self) -> _GitHubRequestNamespace: @@ -331,9 +331,9 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) .with_columns( url_npm=pl.concat_str( - pl.lit(_NPM_BASE_URL), + pl.lit(self._npm_cdn_url), pl.col("tag"), - pl.lit(f"/{_SUB_DIR}/"), + pl.lit(f"/{_DATA}/"), pl.col("file_name"), ) ) @@ -345,7 +345,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: """ Use known tags to discover and update missing trees metadata. - Aims to stay well-within API rate limits, both for authenticated ad unauthenticated users. + Aims to stay well-within API rate limits, both for authenticated and unauthenticated users. 
""" if gh_tags.is_empty(): msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" From 69a619caeaa803599dfc080ed7b3b34f0ca10386 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:55:42 +0000 Subject: [PATCH 059/137] refactor: Rename `ext` -> `suffix` --- altair/datasets/__init__.py | 8 ++++---- altair/datasets/_readers.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 15c8069f9..0db434979 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -30,23 +30,23 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): def url( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: """Return the address of a remote dataset.""" - return self._reader.url(name, ext, tag=tag) + return self._reader.url(name, suffix, tag=tag) def __call__( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, ext, tag=tag, **kwds) + return self._reader.dataset(name, suffix, tag=tag, **kwds) def __repr__(self) -> str: return f"{type(self).__name__}[{type(self._reader).__name__}]" diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cbb02cd00..cebbe1526 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -109,11 +109,11 @@ def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: def url( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, ) -> str: - df = self._query(**validate_constraints(name, ext, tag)) + df = self._query(**validate_constraints(name, suffix, tag)) url = df.item(0, "url_npm") if isinstance(url, str): return url @@ -124,7 +124,7 @@ def url( def dataset( self, name: DatasetName | LiteralString, - ext: Extension | None = None, + suffix: Extension | None = None, /, tag: VersionTag | Literal["latest"] | None = None, **kwds: Any, @@ -134,12 +134,12 @@ def dataset( Parameters ---------- - name, ext, tag + name, suffix, tag TODO **kwds Arguments passed to the underlying read function. """ - df = self._query(**validate_constraints(name, ext, tag)) + df = self._query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -314,7 +314,7 @@ def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw. 
def validate_constraints( name: DatasetName | LiteralString, - ext: Extension | None, + suffix: Extension | None, tag: VersionTag | Literal["latest"] | None, /, ) -> Metadata: @@ -328,11 +328,11 @@ def validate_constraints( constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints - elif ext is not None: - if not is_ext_supported(ext): - raise TypeError(ext) + elif suffix is not None: + if not is_ext_supported(suffix): + raise TypeError(suffix) else: - constraints["suffix"] = ext + constraints["suffix"] = suffix constraints["dataset_name"] = name return constraints From a259b1070e1a2dcd356992bc6fd95982cf6b9ef6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 8 Nov 2024 18:57:31 +0000 Subject: [PATCH 060/137] refactor: Remove unimplemented `tag="latest"` Since `metadata.parquet` is sorted, this was already the behavior when not providing a tag --- altair/datasets/__init__.py | 4 ++-- altair/datasets/_readers.py | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 0db434979..c2ccee2fe 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -32,7 +32,7 @@ def url( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, ) -> str: """Return the address of a remote dataset.""" return self._reader.url(name, suffix, tag=tag) @@ -42,7 +42,7 @@ def __call__( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: """Get a remote dataset and load as tabular data.""" diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cebbe1526..b344bd67a 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -111,7 +111,7 @@ def url( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, ) -> str: df = self._query(**validate_constraints(name, suffix, tag)) url = df.item(0, "url_npm") @@ -126,7 +126,7 @@ def dataset( name: DatasetName | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | Literal["latest"] | None = None, + tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: """ @@ -315,13 +315,11 @@ def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw. 
def validate_constraints( name: DatasetName | LiteralString, suffix: Extension | None, - tag: VersionTag | Literal["latest"] | None, + tag: VersionTag | None, /, ) -> Metadata: constraints: Metadata = {} - if tag == "latest": - raise NotImplementedError(tag) - elif tag is not None: + if tag is not None: constraints["tag"] = tag if name.endswith((".csv", ".json", ".tsv", ".arrow")): fp = Path(name) From 88968c8bf188f5c6817fac2edf3c0b8a44602ec3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 11:56:01 +0000 Subject: [PATCH 061/137] feat: Rename `_datasets_dir`, make configurable, add docs Still on the fence about `Loader.cache_dir` vs `Loader.cache` --- altair/datasets/__init__.py | 31 +++++++++++++++++++++++++++++++ altair/datasets/_readers.py | 10 +++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index c2ccee2fe..c89163a48 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -8,11 +8,13 @@ if TYPE_CHECKING: import sys + from pathlib import Path from typing import Any, Literal import pandas as pd import polars as pl import pyarrow as pa + from _typeshed import StrPath if sys.version_info >= (3, 11): from typing import LiteralString @@ -99,6 +101,35 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj + @property + def cache_dir(self) -> Path | None: + """ + Returns path to datasets cache. + + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You *may* also set this directly, but the value will **not** persist between sessions: + + from pathlib import Path + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + data.cache_dir = Path.home() / ".altair_cache" + + data.cache_dir.relative_to(Path.home()).as_posix() + '.altair_cache' + """ + return self._reader._cache + + @cache_dir.setter + def cache_dir(self, source: StrPath, /) -> None: + import os + + os.environ[self._reader._ENV_VAR] = str(source) + def __getattr__(name): if name == "data": diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index b344bd67a..673e2e6d1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -85,7 +85,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @property - def _datasets_dir(self) -> Path | None: # type: ignore[return] + def _cache(self) -> Path | None: # type: ignore[return] """ Returns path to datasets cache, if possible. 
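# A rough sketch of the opt-in caching flow introduced above (assumes only the
# names visible in this series; the cached-file layout may change):
#
#     import os
#     os.environ["ALTAIR_DATASETS_DIR"] = "/tmp/altair_cache"
#
#     data = Loader.with_backend("polars")
#     data("cars")  # expected: download once, then reuse a "<sha><suffix>" file under the cache dir
#     data("cars")  # later calls read the cached file instead of requesting again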
@@ -94,9 +94,9 @@ def _datasets_dir(self) -> Path | None: # type: ignore[return] Reader._ENV_VAR """ if _dir := os.environ.get(self._ENV_VAR): - datasets_dir = Path(_dir) - datasets_dir.mkdir(exist_ok=True) - return datasets_dir + cache_dir = Path(_dir) + cache_dir.mkdir(exist_ok=True) + return cache_dir def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) @@ -145,7 +145,7 @@ def dataset( url = result["url_npm"] fn = self.reader_from(url) - if cache := self._datasets_dir: + if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) if fp.exists(): return fn(fp, **kwds) From b98730887d0392ac0a2fbb5d226f5013862201c3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 12:13:25 +0000 Subject: [PATCH 062/137] docs: Adds examples to `Loader.with_backend` --- altair/datasets/__init__.py | 49 ++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index c89163a48..4bcf768b6 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -27,6 +27,13 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Load examples **remotely** from `vega-datasets`_, with *optional* caching. + + .. _vega-datasets: + https://github.com/vega/vega-datasets + """ + _reader: _Reader[IntoDataFrameT, IntoFrameT] def url( @@ -74,7 +81,7 @@ def with_backend( @classmethod def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: """ - Initialize a new loader, using the specified backend. + Initialize a new loader, with the specified backend. Parameters ---------- @@ -96,6 +103,46 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: https://pandas.pydata.org/docs/reference/io.html .. 
_JSON format not supported: https://arrow.apache.org/docs/python/json.html#reading-json-files + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + cars = data("cars") + + type(cars) + polars.dataframe.frame.DataFrame + + Using ``pandas``: + + data = Loader.with_backend("pandas") + cars = data("cars") + + type(cars) + pandas.core.frame.DataFrame + + Using ``pandas``, backed by ``pyarrow`` dtypes: + + data = Loader.with_backend("pandas[pyarrow]") + cars = data("cars", tag="v1.29.0") + + type(cars) + pandas.core.frame.DataFrame + + cars.dtypes + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year string[pyarrow] + Origin string[pyarrow] + dtype: object """ obj = Loader.__new__(Loader) obj._reader = get_backend(backend) From 4a2a2e068f85d118244ceda09350cf3690781227 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 14:59:40 +0000 Subject: [PATCH 063/137] refactor: Clean up requirements -> imports --- altair/datasets/_readers.py | 100 ++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 673e2e6d1..78ee784a6 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -63,9 +63,14 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") - _Backend: TypeAlias = Literal[ - "polars", "pandas", "pandas[pyarrow]", "polars[pyarrow]", "pyarrow" - ] + + _Polars: TypeAlias = Literal["polars"] + _Pandas: TypeAlias = Literal["pandas"] + _PyArrow: TypeAlias = Literal["pyarrow"] + _ConcreteT = TypeVar("_ConcreteT", _Polars, _Pandas, _PyArrow) + _PolarsAny: TypeAlias = Literal[_Polars, "polars[pyarrow]"] + _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] + _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] __all__ = ["get_backend"] @@ -80,6 +85,7 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _name: LiteralString _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" @@ -193,11 +199,16 @@ def _import(self, name: str, /) -> Any: msg = f"{type(self).__name__!r} requires missing dependency {name!r}." raise ModuleNotFoundError(msg, name=name) - def __init__(self, *specs: str) -> None: ... + def __repr__(self) -> str: + return f"Reader[{self._name}]" + + def __init__(self, name: LiteralString, /) -> None: ... 
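# Illustrative expectation for the rename above (a sketch, not part of the diff):
# each reader now stores the backend string it was built with, so
#
#     get_backend("polars")            # repr() -> "Reader[polars]"
#     get_backend("pandas[pyarrow]")   # repr() -> "Reader[pandas[pyarrow]]"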
class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, _pd: str, _pa: str, /) -> None: + def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: + _pd, _pa = _requirements(name) + self._name = name if not TYPE_CHECKING: pd = self._import(_pd) pa = self._import(_pa) # noqa: F841 @@ -219,9 +230,10 @@ def __init__(self, _pd: str, _pa: str, /) -> None: class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, _pd: str, /) -> None: + def __init__(self, name: _Pandas, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pd = self._import(_pd) + pd = self._import(self._name) self._read_fn = { ".csv": pd.read_csv, ".json": pd.read_json, @@ -232,9 +244,10 @@ def __init__(self, _pd: str, /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, _pl: str, /) -> None: + def __init__(self, name: _Polars, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pl = self._import(_pl) + pl = self._import(self._name) self._read_fn = { ".csv": pl.read_csv, ".json": pl.read_json, @@ -245,7 +258,9 @@ def __init__(self, _pl: str, /) -> None: class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, _pl: str, _pa: str, /) -> None: + def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: + _pl, _pa = _requirements(name) + self._name = name if not TYPE_CHECKING: pl = self._import(_pl) pa = self._import(_pa) # noqa: F841 @@ -275,13 +290,14 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): https://arrow.apache.org/docs/python/json.html#reading-json-files """ - def __init__(self, _pa: str, /) -> None: + def __init__(self, name: _PyArrow, /) -> None: + self._name = _requirements(name) if not TYPE_CHECKING: - pa = self._import(_pa) # noqa: F841 - pa_csv = self._import(f"{_pa}.csv") - pa_feather = self._import(f"{_pa}.feather") - pa_json = self._import(f"{_pa}.json") - pa_parquet = self._import(f"{_pa}.parquet") + pa = self._import(self._name) # noqa: F841 + pa_csv = self._import(f"{self._name}.csv") + pa_feather = self._import(f"{self._name}.feather") + pa_json = self._import(f"{self._name}.json") + pa_parquet = self._import(f"{self._name}.parquet") pa_read_csv = pa_csv.read_csv pa_read_feather = pa_feather.read_table @@ -353,34 +369,62 @@ def is_ext_supported(suffix: Any) -> TypeIs[Extension]: @overload -def get_backend( - backend: Literal["polars", "polars[pyarrow]"], / -) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... +def get_backend(backend: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload -def get_backend( - backend: Literal["pandas", "pandas[pyarrow]"], / -) -> _Reader[pd.DataFrame, pd.DataFrame]: ... +def get_backend(backend: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... @overload -def get_backend(backend: Literal["pyarrow"], /) -> _Reader[pa.Table, pa.Table]: ... +def get_backend(backend: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... 
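# Sketch of the backend-string parsing this dispatch relies on (taken from the
# `_requirements` helper further down in this diff; shown only as an illustration):
#
#     _requirements("polars")            # -> "polars"
#     _requirements("pandas[pyarrow]")   # -> ("pandas", "pyarrow")
#     _requirements("polars[pyarrow]")   # -> ("polars", "pyarrow")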
def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: if backend == "polars": - return _PolarsReader("polars") + return _PolarsReader(backend) elif backend == "polars[pyarrow]": - return _PolarsPyArrowReader("polars", "pyarrow") + return _PolarsPyArrowReader(backend) elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader("pandas", "pyarrow") + return _PandasPyArrowReader(backend) elif backend == "pandas": - return _PandasReader("pandas") + return _PandasReader(backend) elif backend == "pyarrow": - return _PyArrowReader("pyarrow") + return _PyArrowReader(backend) elif backend in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) else: raise TypeError(backend) + + +@overload +def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... + + +@overload +def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... + + +@overload +def _requirements(s: Literal["polars[pyarrow]"], /) -> tuple[_Polars, _PyArrow]: ... + + +def _requirements(s: _Backend, /): + concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} + if s in concrete: + return s + else: + from packaging.requirements import Requirement + + req = Requirement(s) + supports_extras: set[Literal[_Polars, _Pandas]] = {"polars", "pandas"} + if req.name in supports_extras: + name = req.name + if (extras := req.extras) and extras == {"pyarrow"}: + extra = "pyarrow" + return name, extra + else: + raise NotImplementedError(s) + else: + raise NotImplementedError(s) From e6dd27e6fb680b965e7d698a636d47a389c3e7df Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 15:03:17 +0000 Subject: [PATCH 064/137] docs: Add basic example to `Loader` class Also incorporates changes from previous commit into `__repr__` 4a2a2e068f85d118244ceda09350cf3690781227 --- altair/datasets/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 4bcf768b6..6d7a922d3 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -30,12 +30,20 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ Load examples **remotely** from `vega-datasets`_, with *optional* caching. + A new ``Loader`` must be initialized by specifying a backend: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + Loader[polars] + .. 
_vega-datasets: https://github.com/vega/vega-datasets """ _reader: _Reader[IntoDataFrameT, IntoFrameT] + # TODO: docs (parameters, examples) def url( self, name: DatasetName | LiteralString, @@ -46,6 +54,7 @@ def url( """Return the address of a remote dataset.""" return self._reader.url(name, suffix, tag=tag) + # TODO: docs (parameters, examples) def __call__( self, name: DatasetName | LiteralString, @@ -58,7 +67,7 @@ def __call__( return self._reader.dataset(name, suffix, tag=tag, **kwds) def __repr__(self) -> str: - return f"{type(self).__name__}[{type(self._reader).__name__}]" + return f"{type(self).__name__}[{self._reader._name}]" @overload @classmethod From 2a7bc4f5bbcfea11e416453fa00abbee11ad8c5b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 15:50:51 +0000 Subject: [PATCH 065/137] refactor: Reorder `alt.datasets` module --- altair/datasets/__init__.py | 52 ++++++++++----------- altair/datasets/_readers.py | 92 ++++++++++++++++++------------------- 2 files changed, 72 insertions(+), 72 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 6d7a922d3..260258882 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -43,32 +43,6 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): _reader: _Reader[IntoDataFrameT, IntoFrameT] - # TODO: docs (parameters, examples) - def url( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - ) -> str: - """Return the address of a remote dataset.""" - return self._reader.url(name, suffix, tag=tag) - - # TODO: docs (parameters, examples) - def __call__( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" - return self._reader.dataset(name, suffix, tag=tag, **kwds) - - def __repr__(self) -> str: - return f"{type(self).__name__}[{self._reader._name}]" - @overload @classmethod def with_backend( @@ -157,6 +131,29 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj + # TODO: docs (parameters, examples) + def __call__( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """Get a remote dataset and load as tabular data.""" + return self._reader.dataset(name, suffix, tag=tag, **kwds) + + # TODO: docs (parameters, examples) + def url( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + ) -> str: + """Return the address of a remote dataset.""" + return self._reader.url(name, suffix, tag=tag) + @property def cache_dir(self) -> Path | None: """ @@ -186,6 +183,9 @@ def cache_dir(self, source: StrPath, /) -> None: os.environ[self._reader._ENV_VAR] = str(source) + def __repr__(self) -> str: + return f"{type(self).__name__}[{self._reader._name}]" + def __getattr__(name): if name == "data": diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 78ee784a6..53a18b2d6 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -86,24 +86,10 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] _name: LiteralString - _opener: 
ClassVar[OpenerDirector] = urllib.request.build_opener() _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - @property - def _cache(self) -> Path | None: # type: ignore[return] - """ - Returns path to datasets cache, if possible. - - Requires opt-in via environment variable:: - - Reader._ENV_VAR - """ - if _dir := os.environ.get(self._ENV_VAR): - cache_dir = Path(_dir) - cache_dir.mkdir(exist_ok=True) - return cache_dir - def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_supported) return self._read_fn[suffix] @@ -112,21 +98,6 @@ def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] - def url( - self, - name: DatasetName | LiteralString, - suffix: Extension | None = None, - /, - tag: VersionTag | None = None, - ) -> str: - df = self._query(**validate_constraints(name, suffix, tag)) - url = df.item(0, "url_npm") - if isinstance(url, str): - return url - else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." - raise TypeError(msg) - def dataset( self, name: DatasetName | LiteralString, @@ -145,7 +116,7 @@ def dataset( **kwds Arguments passed to the underlying read function. """ - df = self._query(**validate_constraints(name, suffix, tag)) + df = self.query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -164,7 +135,22 @@ def dataset( with self._opener.open(url) as f: return fn(f.read(), **kwds) - def _query( + def url( + self, + name: DatasetName | LiteralString, + suffix: Extension | None = None, + /, + tag: VersionTag | None = None, + ) -> str: + df = self.query(**validate_constraints(name, suffix, tag)) + url = df.item(0, "url_npm") + if isinstance(url, str): + return url + else: + msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." + raise TypeError(msg) + + def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.DataFrame[IntoDataFrameT]: r""" @@ -192,6 +178,20 @@ def _query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + @property + def _cache(self) -> Path | None: # type: ignore[return] + """ + Returns path to datasets cache, if possible. + + Requires opt-in via environment variable:: + + Reader._ENV_VAR + """ + if _dir := os.environ.get(self._ENV_VAR): + cache_dir = Path(_dir) + cache_dir.mkdir(exist_ok=True) + return cache_dir + def _import(self, name: str, /) -> Any: if spec := find_spec(name): return import_module(spec.name) @@ -205,6 +205,20 @@ def __repr__(self) -> str: def __init__(self, name: LiteralString, /) -> None: ... 
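# How the reordered pieces are expected to fit together (sketch only; the column
# names mirror metadata.parquet as used elsewhere in this series):
#
#     frame = reader.query(dataset_name="cars", suffix=".json")  # narwhals DataFrame
#     url = frame.item(0, "url_npm")                              # what Loader.url() returns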
+class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): + def __init__(self, name: _Pandas, /) -> None: + self._name = _requirements(name) + if not TYPE_CHECKING: + pd = self._import(self._name) + self._read_fn = { + ".csv": pd.read_csv, + ".json": pd.read_json, + ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".arrow": pd.read_feather, + } + self._scan_fn = {".parquet": pd.read_parquet} + + class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: _pd, _pa = _requirements(name) @@ -229,20 +243,6 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} -class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): - def __init__(self, name: _Pandas, /) -> None: - self._name = _requirements(name) - if not TYPE_CHECKING: - pd = self._import(self._name) - self._read_fn = { - ".csv": pd.read_csv, - ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), - ".arrow": pd.read_feather, - } - self._scan_fn = {".parquet": pd.read_parquet} - - class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) From c572180ebc7d876714a38688c53f7e4af87abd93 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 16:59:10 +0000 Subject: [PATCH 066/137] docs: Fill out `Loader.url` --- altair/datasets/__init__.py | 40 +++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 260258882..b7f87bdaa 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -143,7 +143,6 @@ def __call__( """Get a remote dataset and load as tabular data.""" return self._reader.dataset(name, suffix, tag=tag, **kwds) - # TODO: docs (parameters, examples) def url( self, name: DatasetName | LiteralString, @@ -151,7 +150,44 @@ def url( /, tag: VersionTag | None = None, ) -> str: - """Return the address of a remote dataset.""" + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`stem`_ of filename. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + `vega-datasets release`_ version. + + .. _stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + The returned url will always point to an accessible dataset: + + import altair as alt + from altair.datasets import Loader + + data = Loader.with_backend("polars") + data.url("cars", tag="v2.9.0") + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + + We can pass the result directly to a chart: + + url = data.url("cars", tag="v2.9.0") + alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") + """ return self._reader.url(name, suffix, tag=tag) @property From 9ab9463007a8509c25cc69665ba995f42e84792d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 18:06:03 +0000 Subject: [PATCH 067/137] feat: Adds `_Reader._read_metadata` --- altair/datasets/_readers.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 53a18b2d6..ea8d7088c 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -166,7 +166,7 @@ def query( """ source = self._metadata fn = self.scanner_from(source) - frame = nw.from_native(fn(source), pass_through=False) + frame = nw.from_native(fn(source)) result = frame.filter(_filter_reduce(predicates, constraints)) df: nw.DataFrame[Any] = ( result.collect() if isinstance(result, nw.LazyFrame) else result @@ -178,6 +178,19 @@ def query( msg = f"Found no results for:\n{terms}" raise NotImplementedError(msg) + def _read_metadata(self) -> IntoDataFrameT: + """ + Return the full contents of ``metadata.parquet``. + + Effectively an eager read, no filters. + """ + fn = self.scanner_from(self._metadata) + frame = nw.from_native(fn(self._metadata)) + df: nw.DataFrame[Any] = ( + frame.collect() if isinstance(frame, nw.LazyFrame) else frame + ) + return df.to_native() + @property def _cache(self) -> Path | None: # type: ignore[return] """ From dd3edd66e2eb38be3c73f0ad0411e738f2f81495 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 20:43:53 +0000 Subject: [PATCH 068/137] refactor: Rename `(reader|scanner_from()` -> `(read|scan)_fn()` --- altair/datasets/_readers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index ea8d7088c..afa1d2f54 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -90,11 +90,11 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - def reader_from(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: - suffix = validate_suffix(source, is_ext_supported) + def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: + suffix = validate_suffix(source, is_ext_read) return self._read_fn[suffix] - def scanner_from(self, source: StrPath, /) -> Callable[..., IntoFrameT]: + def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] @@ -120,7 +120,7 @@ def dataset( it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] - fn = self.reader_from(url) + fn = self.read_fn(url) if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) @@ -165,7 +165,7 @@ def query( 
https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ source = self._metadata - fn = self.scanner_from(source) + fn = self.scan_fn(source) frame = nw.from_native(fn(source)) result = frame.filter(_filter_reduce(predicates, constraints)) df: nw.DataFrame[Any] = ( @@ -184,7 +184,7 @@ def _read_metadata(self) -> IntoDataFrameT: Effectively an eager read, no filters. """ - fn = self.scanner_from(self._metadata) + fn = self.scan_fn(self._metadata) frame = nw.from_native(fn(self._metadata)) df: nw.DataFrame[Any] = ( frame.collect() if isinstance(frame, nw.LazyFrame) else frame @@ -356,7 +356,7 @@ def validate_constraints( constraints["suffix"] = fp.suffix return constraints elif suffix is not None: - if not is_ext_supported(suffix): + if not is_ext_read(suffix): raise TypeError(suffix) else: constraints["suffix"] = suffix @@ -377,7 +377,7 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" -def is_ext_supported(suffix: Any) -> TypeIs[Extension]: +def is_ext_read(suffix: Any) -> TypeIs[Extension]: return suffix in {".csv", ".json", ".tsv", ".arrow"} From 146cb50c60d0839cf56552b00472f768ec58001c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 9 Nov 2024 21:29:44 +0000 Subject: [PATCH 069/137] refactor(typing): Replace some explicit casts --- altair/datasets/_readers.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index afa1d2f54..78e330047 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -226,7 +226,7 @@ def __init__(self, name: _Pandas, /) -> None: self._read_fn = { ".csv": pd.read_csv, ".json": pd.read_json, - ".tsv": cast(partial["pd.DataFrame"], partial(pd.read_csv, sep="\t")), + ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"), ".arrow": pd.read_feather, } self._scan_fn = {".parquet": pd.read_parquet} @@ -241,19 +241,12 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: pa = self._import(_pa) # noqa: F841 self._read_fn = { - ".csv": cast( - partial["pd.DataFrame"], partial(pd.read_csv, dtype_backend="pyarrow") - ), - ".json": cast( - partial["pd.DataFrame"], partial(pd.read_json, dtype_backend="pyarrow") - ), - ".tsv": cast( - partial["pd.DataFrame"], - partial(pd.read_csv, sep="\t", dtype_backend="pyarrow"), - ), - ".arrow": partial(pd.read_feather, dtype_backend="pyarrow"), + ".csv": partial["pd.DataFrame"](pd.read_csv, dtype_backend=_pa), + ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa), + ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", dtype_backend=_pa), + ".arrow": partial(pd.read_feather, dtype_backend=_pa), } - self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend="pyarrow")} + self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): From 94ad0d1b879f43359dbead2b796db540531a2504 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 12:51:27 +0000 Subject: [PATCH 070/137] refactor: Shorten and document request delays --- tools/datasets/github.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index c2d7141aa..2d0d16fca 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -106,8 +106,10 @@ class 
_GitHubRequestNamespace: _UNAUTH_RATE_LIMIT: Literal[60] = 60 _TAGS_COST: Literal[1] = 1 _TREES_COST: Literal[2] = 2 - _UNAUTH_DELAY: Literal[5] = 5 - _AUTH_DELAY: Literal[1] = 1 + _UNAUTH_DELAY: Literal[5_000] = 5_000 + """**ms** delay added between **unauthenticated** ``trees`` requests.""" + _AUTH_DELAY: Literal[500] = 500 + """**ms** delay added between **authenticated** ``trees`` requests.""" _UNAUTH_TREES_LIMIT: Literal[10] = 10 def __init__(self, gh: GitHub, /) -> None: @@ -123,6 +125,10 @@ def rate_limit(self) -> GitHubRateLimitResources: content: GitHubRateLimitResources = json.load(response)["resources"] return content + def delay(self, *, is_auth: bool) -> float: + ms = self._AUTH_DELAY if is_auth else self._UNAUTH_DELAY + return (ms + random.triangular()) / 1_000 + def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" if n < 1 or n > self._TAGS_MAX_PAGE: @@ -314,6 +320,11 @@ def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: raise NotImplementedError(limit) return limit + def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> float: + """Return a delay time in seconds, corresponding with authentication status.""" + limit = rate_limit or self.rate_limit(strict=True) + return self.req.delay(is_auth=limit["is_auth"]) + def tags( self, n_head: int | None = None, *, warn_lower: bool = False ) -> pl.DataFrame: @@ -412,14 +423,13 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: cost = req._TREES_COST * n if rate_limit["remaining"] < cost: raise NotImplementedError(rate_limit, cost) - delay_secs = req._AUTH_DELAY if rate_limit["is_auth"] else req._UNAUTH_DELAY print( f"Collecting metadata for {n} missing releases.\n" - f"Using {delay_secs=} between requests ..." + f"Using {self.delay(rate_limit)}[ms] between requests ..." 
) dfs: list[pl.DataFrame] = [] for tag in tags: - time.sleep(delay_secs + random.triangular()) + time.sleep(self.delay(rate_limit)) dfs.append(self.trees(tag)) df = pl.concat(dfs) print(f"Finished collection.\n" f"Found {df.height} new rows") From 409338397ebb9ff2ec7abb146394f17702762b08 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 17:26:01 +0000 Subject: [PATCH 071/137] feat(DRAFT): Make `[tag]` a `pl.Enum` --- altair/datasets/_metadata/metadata.parquet | Bin 18495 -> 18641 bytes tools/datasets/__init__.py | 6 +++++ tools/datasets/github.py | 17 ++++++++++--- tools/datasets/semver.py | 28 ++++++++++++++++++--- 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 8bf0e17e3673d2b7cfbbe1ddba345f492d12e674..5e7b3bd06439ace1d5eb8387efecde322cca91d7 100644 GIT binary patch delta 725 zcmdl#f$`!*#trL~>c24hh=zzVFfeFr)&HAtm7$fH;gbuWz=qc)p%RZu@A{rHk$(Ll zgLz$*ldAAX^S8QgF~)a3?RlNFaBlbpHYyy|qy~%HsrG>;kfK|T&s{mGC;j*9AbCgGm3!xAi?7p6yzUnRh*oWnwu!m!l4fa*^?K$I4e1O zI;J~120D5=ItG+GIXXHU06C7Sj;=r^i0@K7nZq@{J{P1V*AWOj%YlSbK8P6!BAkHQ zfux(0r=xp1P#ci%4mPeFWJahXh=dZB0YF14K>8hhL4-4y4g}F)MM0saVIe@H9MTNQo%?5#hETE|tFav@90J_G}#R%kJ aFbBj>2l+2>^K92SOdRGM3=9E|L52YRdh{Ux delta 516 zcmcaOk#YY7#trL~>OV00h=zzVFfeFr)&HAtm7$cGLCBR)V8iQ@P>DyScYRNpNWcD& z!Mx5pQZquZmnD1Qmf`LI%bA`Z;$0qC> zM3)%~2rwER%bIGq;Y$4XYcF)4v#r)kF*bBuDDvxW#3xy~O_vtW-%`q?!aCVdIm{zH zYgww-s>fRE6uGy}f4y_=>7TnT*I3T6?5V8Tne#LJ-<1DP^nbeloACef=Eud4&;Qu- zL;GK+{iFIH(*IiRAKL%m{?};#V2kzM$>){j7;kNUr_3nD*thwEZ8sC&y&gu_FmcNXmMht4J`Wa<^{|8T3~3yAOQ2n5cKlet{uCri3IY_@cpz{Ihfoq-|1F~|@AOW4Er diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 6319bd65e..f318f292e 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -88,6 +88,7 @@ def npm(self) -> Npm: return self._npm def refresh(self) -> pl.DataFrame: + """Update and sync all metadata files.""" npm_tags = self.npm.tags() self.write_parquet(npm_tags, self._paths["npm_tags"]) @@ -98,6 +99,11 @@ def refresh(self) -> pl.DataFrame: self.write_parquet(gh_trees, self._paths["gh_trees"]) return gh_trees + def reset(self) -> None: + """Remove all metadata files.""" + for fp in self._paths.values(): + fp.unlink(missing_ok=True) + def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" return pl.read_parquet(self._from_alias(name)) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 2d0d16fca..6bde876ae 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -357,6 +357,10 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: Use known tags to discover and update missing trees metadata. Aims to stay well-within API rate limits, both for authenticated and unauthenticated users. + + Notes + ----- + Internally handles regenerating the ``tag`` enum. 
""" if gh_tags.is_empty(): msg = f"Expected rows present in `gh_tags`, but got:\n{gh_tags!r}" @@ -367,18 +371,23 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: TP = ReParsedTag if not fp.exists(): print(f"Initializing {fp!s}") - return self._trees_batched(_iter_rows(gh_tags, stop, TP)) + result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: - trees = pl.read_parquet(fp) + trees = ( + pl.scan_parquet(fp) + .with_columns(pl.col("tag").cast(pl.String)) + .collect() + ) missing_trees = gh_tags.join( trees.select(pl.col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): print(f"Already up-to-date {fp!s}") - return trees + result = trees else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) - return pl.concat((trees, fresh)) + result = pl.concat((trees, fresh)) + return result.with_columns(pl.col("tag").cast(semver.tag_enum(gh_tags))) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index cb4c6c799..57f6d509f 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -52,6 +52,28 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: return ldf -def sort(frame: _Frame, /) -> _Frame: - """Sort ``frame``, displaying in descending release order.""" - return frame.sort(_SEM_VER_FIELDS, descending=True) +def tag_enum(frame: _Frame, /, *, col_tag: str = "tag") -> pl.Enum: + """Extract an **ascending** order ``pl.Enum`` from ``col_tag``.""" + return pl.Enum( + frame.lazy() + .pipe(sort, descending=False) + .select(col_tag) + .collect() + .get_column(col_tag) + ) + + +def sort(frame: _Frame, /, descending: bool = True) -> _Frame: + """ + Sort ``frame``, displaying in release order. + + Parameters + ---------- + descending + By default, **most recent** is first. + + Notes + ----- + Ensures pre release versions maintain order, always appearing before actual releases. + """ + return frame.sort(_SEM_VER_FIELDS, descending=descending, nulls_last=not descending) From 76cdd45af0e1dc7ac632899b3618c199be5291ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:12:55 +0000 Subject: [PATCH 072/137] fix: Handle `pyarrow` scalars conversion --- altair/datasets/_readers.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 78e330047..3b122df10 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -147,8 +147,15 @@ def url( if isinstance(url, str): return url else: - msg = f"Expected 'str' but got {type(url).__name__!r} from {url!r}." - raise TypeError(msg) + converted = nw.to_py_scalar(url) + if isinstance(converted, str): + return converted + else: + msg = ( + f"Expected 'str' but got {type(converted).__name__!r}\n" + f"from {converted!r}." 
+ ) + raise TypeError(msg) def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] From bb7bc171a7005fd63f39b3d949902f4d553801f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:15:52 +0000 Subject: [PATCH 073/137] test: Adds `test_datasets` Initially quite basic, need to add more parameterize and test caching --- tests/test_datasets.py | 45 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/test_datasets.py diff --git a/tests/test_datasets.py b/tests/test_datasets.py new file mode 100644 index 000000000..a15fb9411 --- /dev/null +++ b/tests/test_datasets.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +import pytest +from narwhals.dependencies import is_into_dataframe +from narwhals.stable import v1 as nw + +import altair as alt # noqa: F401 +from altair.datasets import Loader + +if TYPE_CHECKING: + from altair.datasets._readers import _Backend + +backends = pytest.mark.parametrize( + "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] +) + + +@backends +def test_loader_with_backend(backend: _Backend) -> None: + data = Loader.with_backend(backend) + assert data._reader._name == backend + + +@backends +def test_loader_url(backend: _Backend) -> None: + data = Loader.with_backend(backend) + dataset_name = "volcano" + pattern = re.compile( + rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" + ) + url = data.url(dataset_name) + assert isinstance(url, str) + assert pattern.match(url) is not None + + +@backends +def test_loader_call(backend: _Backend) -> None: + data = Loader.with_backend(backend) + frame = data("stocks", ".csv") + assert is_into_dataframe(frame) + nw_frame = nw.from_native(frame) + assert set(nw_frame.columns) == {"symbol", "date", "price"} From ebc1bfaa0b35e554da15bab7dd7d7e2a95f17e63 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 19:31:53 +0000 Subject: [PATCH 074/137] fix(DRAFT): hotfix `pyarrow` read --- altair/datasets/_readers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 3b122df10..f58fcd56d 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -98,6 +98,10 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] + def _response_hook(self, f): + # HACK: pyarrow wants the file obj + return f.read() + def dataset( self, name: DatasetName | LiteralString, @@ -133,7 +137,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(f.read(), **kwds) + return fn(self._response_hook(f), **kwds) def url( self, @@ -329,6 +333,9 @@ def __init__(self, name: _PyArrow, /) -> None: } self._scan_fn = {".parquet": pa_read_parquet} + def _response_hook(self, f): + return f + def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ From fe0ae88201cc699b32ee1e9c07b602d9d7a8d439 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 20:56:22 +0000 Subject: [PATCH 075/137] fix(DRAFT): Treat `polars` as exception, invalidate cache Possibly fix https://github.com/vega/altair/actions/runs/11768349827/job/32778071725?pr=3631 --- altair/datasets/_readers.py 
| 13 ++++++++----- tests/test_datasets.py | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index f58fcd56d..eea9f18db 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -99,8 +99,8 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: return self._scan_fn[suffix] def _response_hook(self, f): - # HACK: pyarrow wants the file obj - return f.read() + # HACK: `pyarrow` + `pandas` wants the file obj + return f def dataset( self, @@ -273,6 +273,9 @@ def __init__(self, name: _Polars, /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} + def _response_hook(self, f): + return f.read() + class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: @@ -289,6 +292,9 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} + def _response_hook(self, f): + return f.read() + class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ @@ -333,9 +339,6 @@ def __init__(self, name: _PyArrow, /) -> None: } self._scan_fn = {".parquet": pa_read_parquet} - def _response_hook(self, f): - return f - def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: """ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a15fb9411..c37bc0046 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -39,6 +39,7 @@ def test_loader_url(backend: _Backend) -> None: @backends def test_loader_call(backend: _Backend) -> None: data = Loader.with_backend(backend) + data.cache_dir = "" frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) From 7089f2af693c6db2025ee265f31ec4ef228dd8c3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 10 Nov 2024 21:11:07 +0000 Subject: [PATCH 076/137] test: Skip `pyarrow` tests on `3.9` Forgot that this gets uninstalled in CI https://github.com/vega/altair/actions/runs/11768424121/job/32778234026?pr=3631 --- tests/test_datasets.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index c37bc0046..ec2f9014f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -9,12 +9,15 @@ import altair as alt # noqa: F401 from altair.datasets import Loader +from tests import skip_requires_pyarrow if TYPE_CHECKING: from altair.datasets._readers import _Backend -backends = pytest.mark.parametrize( - "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] +backends = skip_requires_pyarrow( + pytest.mark.parametrize( + "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] + ) ) @@ -39,7 +42,7 @@ def test_loader_url(backend: _Backend) -> None: @backends def test_loader_call(backend: _Backend) -> None: data = Loader.with_backend(backend) - data.cache_dir = "" + data.cache_dir = "" # type: ignore[assignment] frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) From e1290d4384d4926c24f22a3a23f103e284cfbe1e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 13:50:54 +0000 Subject: [PATCH 077/137] refactor: Tidy up changes from last 4 commits - Rename and properly document "file-like object" handling - Also made a bit clearer what is being called and when - Use a more granular 
approach to skipping in `@backends` - Previously, everything was skipped regardless of whether it required `pyarrow` - Now, `polars`, `pandas` **always** run - with `pandas` expected to fail - I had to clean up `skip_requires_pyarrow` to make it compatible with `pytest.param` - It has a runtime check for if `MarkDecorator`, instead of just a callable https://github.com/vega/altair/pull/3631/commits/bb7bc171a7005fd63f39b3d949902f4d553801f0, https://github.com/vega/altair/pull/3631/commits/ebc1bfaa0b35e554da15bab7dd7d7e2a95f17e63, https://github.com/vega/altair/pull/3631/commits/fe0ae88201cc699b32ee1e9c07b602d9d7a8d439, https://github.com/vega/altair/pull/3631/commits/7089f2af693c6db2025ee265f31ec4ef228dd8c3 --- altair/datasets/_readers.py | 33 ++++++++++++++++++++++----------- tests/__init__.py | 31 +++++++++++++++++++------------ tests/test_datasets.py | 26 ++++++++++++++++++++++---- 3 files changed, 63 insertions(+), 27 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index eea9f18db..a3435d231 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -12,6 +12,7 @@ import os import urllib.request from functools import partial +from http.client import HTTPResponse from importlib import import_module from importlib.util import find_spec from itertools import chain, islice @@ -76,6 +77,10 @@ __all__ = ["get_backend"] +def _identity(_: _T, /) -> _T: + return _ + + class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): """ Common functionality between backends. @@ -88,6 +93,18 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): _name: LiteralString _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() + _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) + """ + Backends that do not support `file-like objects`_, must override with conversion. + + Used only for **remote** files, as *cached* files use a `pathlib.Path`_. + + .. _file-like objects: + https://docs.python.org/3/glossary.html#term-file-object + .. 
_pathlib.Path: + https://docs.python.org/3/library/pathlib.html#pathlib.Path + """ + _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: @@ -98,10 +115,6 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: suffix = validate_suffix(source, is_ext_scan) return self._scan_fn[suffix] - def _response_hook(self, f): - # HACK: `pyarrow` + `pandas` wants the file obj - return f - def dataset( self, name: DatasetName | LiteralString, @@ -137,7 +150,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(self._response_hook(f), **kwds) + return fn(self._response(f), **kwds) def url( self, @@ -261,6 +274,8 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _response = staticmethod(HTTPResponse.read) + def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -273,11 +288,10 @@ def __init__(self, name: _Polars, /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} - def _response_hook(self, f): - return f.read() - class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): + _response = staticmethod(HTTPResponse.read) + def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: _pl, _pa = _requirements(name) self._name = name @@ -292,9 +306,6 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: } self._scan_fn = {".parquet": pl.scan_parquet} - def _response_hook(self, f): - return f.read() - class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ diff --git a/tests/__init__.py b/tests/__init__.py index 617cfca80..17a33e91e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -5,14 +5,14 @@ import sys from importlib.util import find_spec from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, overload import pytest from tests import examples_arguments_syntax, examples_methods_syntax if TYPE_CHECKING: - from collections.abc import Callable, Collection, Iterator, Mapping + from collections.abc import Collection, Iterator, Mapping from re import Pattern if sys.version_info >= (3, 11): @@ -20,6 +20,7 @@ else: from typing_extensions import TypeAlias from _pytest.mark import ParameterSet + from _pytest.mark.structures import Markable MarksType: TypeAlias = ( "pytest.MarkDecorator | Collection[pytest.MarkDecorator | pytest.Mark]" @@ -96,9 +97,21 @@ def windows_has_tzdata() -> bool: """ +@overload def skip_requires_pyarrow( - fn: Callable[..., Any] | None = None, /, *, requires_tzdata: bool = False -) -> Callable[..., Any]: + fn: None = ..., /, *, requires_tzdata: bool = ... +) -> pytest.MarkDecorator: ... + + +@overload +def skip_requires_pyarrow( + fn: Markable, /, *, requires_tzdata: bool = ... +) -> Markable: ... + + +def skip_requires_pyarrow( + fn: Markable | None = None, /, *, requires_tzdata: bool = False +) -> pytest.MarkDecorator | Markable: """ ``pytest.mark.skipif`` decorator. @@ -109,7 +122,7 @@ def skip_requires_pyarrow( https://github.com/vega/altair/issues/3050 .. _pyarrow: - https://pypi.org/project/pyarrow/ + https://pypi.org/project/pyarrow/ """ composed = pytest.mark.skipif( find_spec("pyarrow") is None, reason="`pyarrow` not installed." 
@@ -120,13 +133,7 @@ def skip_requires_pyarrow( reason="Timezone database is not installed on Windows", )(composed) - def wrap(test_fn: Callable[..., Any], /) -> Callable[..., Any]: - return composed(test_fn) - - if fn is None: - return wrap - else: - return wrap(fn) + return composed if fn is None else composed(fn) def id_func_str_only(val) -> str: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index ec2f9014f..7a4ab51f1 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from importlib.util import find_spec from typing import TYPE_CHECKING import pytest @@ -14,10 +15,27 @@ if TYPE_CHECKING: from altair.datasets._readers import _Backend -backends = skip_requires_pyarrow( - pytest.mark.parametrize( - "backend", ["polars", "polars[pyarrow]", "pandas", "pandas[pyarrow]", "pyarrow"] - ) + +requires_pyarrow = skip_requires_pyarrow() + +backends = pytest.mark.parametrize( + "backend", + [ + "polars", + pytest.param( + "pandas", + marks=pytest.mark.xfail( + find_spec("pyarrow") is None, + reason=( + "`pandas` supports backends other than `pyarrow` for `.parquet`.\n" + "However, none of these are currently an `altair` dependency." + ), + ), + ), + pytest.param("polars[pyarrow]", marks=requires_pyarrow), + pytest.param("pandas[pyarrow]", marks=requires_pyarrow), + pytest.param("pyarrow", marks=requires_pyarrow), + ], ) From 9d88e1bbb20b6b24bc3cefc40c62108e259edf65 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:37:21 +0000 Subject: [PATCH 078/137] refactor: Rework `_readers.py` - Moved `_Reader._metadata` -> module-level constant `_METADATA`. - It was never modified and is based on the relative directory of this module - Generally improved the readability with more method-chaining (less assignment) - Renamed, improved doc `_filter_reduce` -> `_parse_predicates_constraints` --- altair/datasets/_readers.py | 55 ++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index a3435d231..b2f41af89 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -22,6 +22,7 @@ Any, Callable, ClassVar, + Final, Generic, Literal, Protocol, @@ -76,6 +77,8 @@ __all__ = ["get_backend"] +_METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" + def _identity(_: _T, /) -> _T: return _ @@ -105,8 +108,6 @@ class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): https://docs.python.org/3/library/pathlib.html#pathlib.Path """ - _metadata: Path = Path(__file__).parent / "_metadata" / "metadata.parquet" - def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_read) return self._read_fn[suffix] @@ -159,20 +160,13 @@ def url( /, tag: VersionTag | None = None, ) -> str: - df = self.query(**validate_constraints(name, suffix, tag)) - url = df.item(0, "url_npm") + frame = self.query(**validate_constraints(name, suffix, tag)) + url = nw.to_py_scalar(frame.item(0, "url_npm")) if isinstance(url, str): return url else: - converted = nw.to_py_scalar(url) - if isinstance(converted, str): - return converted - else: - msg = ( - f"Expected 'str' but got {type(converted).__name__!r}\n" - f"from {converted!r}." - ) - raise TypeError(msg) + msg = f"Expected 'str' but got {type(url).__name__!r}\n" f"from {url!r}." 
+ raise TypeError(msg) def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] @@ -188,15 +182,14 @@ def query( .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ - source = self._metadata - fn = self.scan_fn(source) - frame = nw.from_native(fn(source)) - result = frame.filter(_filter_reduce(predicates, constraints)) - df: nw.DataFrame[Any] = ( - result.collect() if isinstance(result, nw.LazyFrame) else result + frame = ( + nw.from_native(self.scan_fn(_METADATA)(_METADATA)) + .filter(_parse_predicates_constraints(predicates, constraints)) + .lazy() + .collect() ) - if not df.is_empty(): - return df + if not frame.is_empty(): + return frame else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) msg = f"Found no results for:\n{terms}" @@ -208,12 +201,12 @@ def _read_metadata(self) -> IntoDataFrameT: Effectively an eager read, no filters. """ - fn = self.scan_fn(self._metadata) - frame = nw.from_native(fn(self._metadata)) - df: nw.DataFrame[Any] = ( - frame.collect() if isinstance(frame, nw.LazyFrame) else frame + return ( + nw.from_native(self.scan_fn(_METADATA)(_METADATA)) + .lazy() + .collect() + .to_native() ) - return df.to_native() @property def _cache(self) -> Path | None: # type: ignore[return] @@ -351,11 +344,15 @@ def __init__(self, name: _PyArrow, /) -> None: self._scan_fn = {".parquet": pa_read_parquet} -def _filter_reduce(predicates: tuple[Any, ...], constraints: Metadata, /) -> nw.Expr: +def _parse_predicates_constraints( + predicates: tuple[Any, ...], constraints: Metadata, / +) -> nw.Expr: """ - ``narwhals`` only accepts ``filter(*predicates)`. + ``narwhals`` only accepts ``filter(*predicates)``. + + So we convert each item in ``**constraints`` here as:: - Manually converts the constraints into ``==`` + col("column_name") == literal_value """ return nw.all_horizontal( chain(predicates, (nw.col(name) == v for name, v in constraints.items())) From 60d39f5f7f175f94b2511b221ee2fd1760eacb9e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:40:12 +0000 Subject: [PATCH 079/137] test: Adds tests for missing dependencies --- altair/datasets/_readers.py | 14 ++++++++++- tests/test_datasets.py | 48 +++++++++++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index b2f41af89..20b308aed 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -226,7 +226,19 @@ def _import(self, name: str, /) -> Any: if spec := find_spec(name): return import_module(spec.name) else: - msg = f"{type(self).__name__!r} requires missing dependency {name!r}." 
+ reqs = _requirements(self._name) # type: ignore[call-overload] + if isinstance(reqs, tuple): + depends = ", ".join(f"{req!r}" for req in reqs) + " packages" + else: + depends = f"{reqs!r} package" + + msg = ( + f"Backend {self._name!r} requires the {depends}, but {name!r} could not be found.\n" + f"This can be installed with pip using:\n" + f" pip install {name}\n" + f"Or with conda using:\n" + f" conda install -c conda-forge {name}" + ) raise ModuleNotFoundError(msg, name=name) def __repr__(self) -> str: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 7a4ab51f1..de932137f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +import sys from importlib.util import find_spec from typing import TYPE_CHECKING @@ -13,8 +14,12 @@ from tests import skip_requires_pyarrow if TYPE_CHECKING: + from typing import Literal + from altair.datasets._readers import _Backend +CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" + requires_pyarrow = skip_requires_pyarrow() @@ -58,10 +63,49 @@ def test_loader_url(backend: _Backend) -> None: @backends -def test_loader_call(backend: _Backend) -> None: +def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv(CACHE_ENV_VAR, raising=False) + data = Loader.with_backend(backend) - data.cache_dir = "" # type: ignore[assignment] frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) assert set(nw_frame.columns) == {"symbol", "date", "price"} + + +@backends +def test_missing_dependency_single( + backend: _Backend, monkeypatch: pytest.MonkeyPatch +) -> None: + if backend in {"polars[pyarrow]", "pandas[pyarrow]"}: + pytest.skip("Testing single dependency backends only") + + monkeypatch.setitem(sys.modules, backend, None) + + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{backend}.+requires.+{backend}.+but.+{backend}.+not.+found.+pip install {backend}", + flags=re.DOTALL, + ), + ): + Loader.with_backend(backend) + + +@pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) +@skip_requires_pyarrow +def test_missing_dependency_multi( + backend: _Backend, monkeypatch: pytest.MonkeyPatch +) -> None: + secondary = "pyarrow" + primary = backend.removesuffix(f"[{secondary}]") + monkeypatch.setitem(sys.modules, secondary, None) + + with pytest.raises( + ModuleNotFoundError, + match=re.compile( + rf"{re.escape(backend)}.+requires.+'{primary}', '{secondary}'.+but.+{secondary}.+not.+found.+pip install {secondary}", + flags=re.DOTALL, + ), + ): + Loader.with_backend(backend) From d6f0e45a3ade1fd9ca08e22b2ae9f6710eabd496 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 18:36:28 +0000 Subject: [PATCH 080/137] test: Adds `test_dataset_not_found` --- altair/datasets/_readers.py | 10 ++-- tests/test_datasets.py | 95 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 20b308aed..ebd996d65 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -192,8 +192,8 @@ def query( return frame else: terms = "\n".join(f"{t!r}" for t in (predicates, constraints) if t) - msg = f"Found no results for:\n{terms}" - raise NotImplementedError(msg) + msg = f"Found no results for:\n {terms}" + raise ValueError(msg) def _read_metadata(self) -> IntoDataFrameT: """ @@ -378,16 
+378,18 @@ def validate_constraints( /, ) -> Metadata: constraints: Metadata = {} + suffixes = ".csv", ".json", ".tsv", ".arrow" if tag is not None: constraints["tag"] = tag - if name.endswith((".csv", ".json", ".tsv", ".arrow")): + if name.endswith(suffixes): fp = Path(name) constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints elif suffix is not None: if not is_ext_read(suffix): - raise TypeError(suffix) + msg = f"Expected 'suffix' to be one of {suffixes!r},\nbut got: {suffix!r}" + raise TypeError(msg) else: constraints["suffix"] = suffix constraints["dataset_name"] = name diff --git a/tests/test_datasets.py b/tests/test_datasets.py index de932137f..cf26fc0f8 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -109,3 +109,98 @@ def test_missing_dependency_multi( ), ): Loader.with_backend(backend) + + +@backends +def test_dataset_not_found(backend: _Backend) -> None: + """ + Various queries that should **always raise** due to non-existent dataset. + + ``Loader.url`` is used since it doesn't require a remote connection. + """ + import polars as pl + + data = Loader.with_backend(backend) + real_name: Literal["disasters"] = "disasters" + real_suffix: Literal[".csv"] = ".csv" + real_tag: Literal["v1.14.0"] = "v1.14.0" + + invalid_name: Literal["fake name"] = "fake name" + invalid_suffix: Literal["fake suffix"] = "fake suffix" + invalid_tag: Literal["fake tag"] = "fake tag" + + incorrect_suffix: Literal[".json"] = ".json" + incorrect_tag: Literal["v1.5.0"] = "v1.5.0" + + ERR_NO_RESULT = ValueError + # NOTE: ``polars`` enforces enums stricter than other packages. + # Rather than returning an empty dataframe, filtering on a value + # *outside* of the enum range raises an internal error. + ERR_NO_RESULT_OR_ENUM = (ERR_NO_RESULT, pl.exceptions.InvalidOperationError) + + MSG_NO_RESULT = "Found no results for" + NAME = "dataset_name" + SUFFIX = "suffix" + TAG = "tag" + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile(rf"{MSG_NO_RESULT}.+{NAME}.+{invalid_name}", re.DOTALL), + ): + data.url(invalid_name) + + with pytest.raises( + TypeError, + match=re.compile( + rf"Expected '{SUFFIX}' to be one of.+\(.+\).+but got.+{invalid_suffix}", + re.DOTALL, + ), + ): + data.url(real_name, invalid_suffix) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT_OR_ENUM, + match=re.compile(rf"{invalid_tag}", re.DOTALL), + ): + data.url(real_name, tag=invalid_tag) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT_OR_ENUM, + match=re.compile(rf"{invalid_tag}", re.DOTALL), + ): + data.url(real_name, real_suffix, tag=invalid_tag) # type: ignore[arg-type] + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{SUFFIX}.+{real_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, real_suffix, tag=incorrect_tag) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, incorrect_suffix) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{real_tag}.+{SUFFIX}.+{incorrect_suffix}.+{NAME}.+{real_name}", + re.DOTALL, + ), + ): + data.url(real_name, incorrect_suffix, tag=real_tag) + + with pytest.raises( + ERR_NO_RESULT, + match=re.compile( + rf"{MSG_NO_RESULT}.+{TAG}.+{incorrect_tag}.+{NAME}.+{real_name}", re.DOTALL + ), + ): + data.url(real_name, tag=incorrect_tag) From b7d57a0b497de6bc824f3e2600894cc75f5ad413 
Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 19:44:28 +0000 Subject: [PATCH 081/137] test: Adds `test_reader_cache` --- tests/test_datasets.py | 74 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index cf26fc0f8..b3cd1ab8c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,10 +3,10 @@ import re import sys from importlib.util import find_spec -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pytest -from narwhals.dependencies import is_into_dataframe +from narwhals.dependencies import is_into_dataframe, is_polars_dataframe from narwhals.stable import v1 as nw import altair as alt # noqa: F401 @@ -14,6 +14,7 @@ from tests import skip_requires_pyarrow if TYPE_CHECKING: + from pathlib import Path from typing import Literal from altair.datasets._readers import _Backend @@ -204,3 +205,72 @@ def test_dataset_not_found(backend: _Backend) -> None: ), ): data.url(real_name, tag=incorrect_tag) + + +@backends +def test_reader_cache( + backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """ + Using a sample of the smallest datasets, make *"requests"* that are all caught by prior hits. + + Note + ---- + `tmp_path`_ is a built-in fixture. + + .. _tmp_path: + https://docs.pytest.org/en/stable/getting-started.html#request-a-unique-temporary-directory-for-functional-tests + """ + import polars as pl + from polars.testing import assert_frame_equal + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + + data = Loader.with_backend(backend) + cache_dir = data.cache_dir + assert cache_dir is not None + assert cache_dir == tmp_path + + assert tuple(cache_dir.iterdir()) == () + + # smallest csvs + lookup_groups = data("lookup_groups", tag="v2.5.3") + data("lookup_people", tag="v2.4.0") + data("iowa-electricity", tag="v2.3.1") + data("global-temp", tag="v2.9.0") + + cached_paths = tuple(cache_dir.iterdir()) + assert len(cached_paths) == 4 + + if is_polars_dataframe(lookup_groups): + left, right = ( + lookup_groups, + cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), + ) + else: + left, right = ( + pl.DataFrame(lookup_groups), + pl.DataFrame(data("lookup_groups", tag="v2.5.3")), + ) + + assert_frame_equal(left, right) + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) + + data("iowa-electricity", tag="v1.30.2") + data("global-temp", tag="v2.8.1") + data("global-temp", tag="v2.8.0") + + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) + + data("lookup_people", tag="v1.10.0") + data("lookup_people", tag="v1.11.0") + data("lookup_people", tag="v1.20.0") + data("lookup_people", tag="v1.21.0") + data("lookup_people", tag="v2.1.0") + data("lookup_people", tag="v2.3.0") + data("lookup_people", tag="v2.5.0-next.0") + + assert len(tuple(cache_dir.iterdir())) == 4 + assert cached_paths == tuple(cache_dir.iterdir()) From b70aef883721ce1ce905e1ec8e82938eb4859257 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 21:23:43 +0000 Subject: [PATCH 082/137] docs: Finish `_Reader`, fill parameters of `Loader.__call__` Still need examples for `Loader.__call__` --- altair/datasets/__init__.py | 31 +++++++++++++++++++--- altair/datasets/_readers.py | 52 +++++++++++++++++++++++-------------- 2 files changed, 60 
insertions(+), 23 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index b7f87bdaa..4260314d1 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -131,7 +131,7 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: obj._reader = get_backend(backend) return obj - # TODO: docs (parameters, examples) + # TODO: docs (examples) def __call__( self, name: DatasetName | LiteralString, @@ -140,7 +140,30 @@ def __call__( tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: - """Get a remote dataset and load as tabular data.""" + """ + Get a remote dataset and load as tabular data. + + Parameters + ---------- + name + Name of the dataset/`stem`_ of file name. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + **kwds + Arguments passed to the underlying read function. + + .. _stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + """ return self._reader.dataset(name, suffix, tag=tag, **kwds) def url( @@ -156,14 +179,14 @@ def url( Parameters ---------- name - Name of the dataset/`stem`_ of filename. + Name of the dataset/`stem`_ of file name. suffix File extension/`Path.suffix`_. .. note:: Only needed if ``name`` is available in multiple formats. tag - `vega-datasets release`_ version. + Version identifier for a `vega-datasets release`_. .. _stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index ebd996d65..fe8f8212f 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -23,7 +23,6 @@ Callable, ClassVar, Final, - Generic, Literal, Protocol, TypeVar, @@ -84,16 +83,42 @@ def _identity(_: _T, /) -> _T: return _ -class _Reader(Generic[IntoDataFrameT, IntoFrameT], Protocol): +class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ - Common functionality between backends. + Describes basic IO for remote & local tabular resources. - Trying to use ``narwhals`` as much as possible + Subclassing this protocol directly will provide a *mostly* complete implementation. + + Each of the following must be explicitly assigned: + + _Reader._read_fn + _Reader._scan_fn + _Reader._name """ _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + """ + Eager file read functions. + + Each corresponds to a known file extension within ``vega-datasets``. + """ + _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + """ + *Optionally*-lazy file read/scan functions. + + Used exclusively for ``metadata.parquet``. + + Currently ``polars`` backends are the only lazy options. + """ + _name: LiteralString + """ + Used in error messages, repr and matching ``@overload``(s). + + Otherwise, has no concrete meaning. + """ + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) @@ -124,16 +149,6 @@ def dataset( tag: VersionTag | None = None, **kwds: Any, ) -> IntoDataFrameT: - """ - Fetch a remote dataset, attempt caching if possible. - - Parameters - ---------- - name, suffix, tag - TODO - **kwds - Arguments passed to the underlying read function. 
- """ df = self.query(**validate_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) @@ -171,13 +186,12 @@ def url( def query( self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] ) -> nw.DataFrame[IntoDataFrameT]: - r""" + """ Query multi-version trees metadata. - Parameters - ---------- - \*predicates, \*\*constraints - Passed directly to `pl.LazyFrame.filter`_. + Notes + ----- + Arguments correspond to those seen in `pl.LazyFrame.filter`_. .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html From 403b7874f360fc2f1734de538e81a91e4c4ddffe Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 11 Nov 2024 21:48:21 +0000 Subject: [PATCH 083/137] refactor: Rename `backend` -> `backend_name`, `get_backend` -> `backend` `get_` was the wrong term since it isn't a free operation --- altair/datasets/__init__.py | 14 ++++++------- altair/datasets/_readers.py | 40 +++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 4260314d1..b6f983754 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -4,7 +4,7 @@ from narwhals.typing import IntoDataFrameT, IntoFrameT -from altair.datasets._readers import _Reader, get_backend +from altair.datasets._readers import _Reader, backend if TYPE_CHECKING: import sys @@ -46,29 +46,29 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod def with_backend( - cls, backend: Literal["polars", "polars[pyarrow]"], / + cls, backend_name: Literal["polars", "polars[pyarrow]"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @classmethod def with_backend( - cls, backend: Literal["pandas", "pandas[pyarrow]"], / + cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / ) -> Loader[pd.DataFrame, pd.DataFrame]: ... @overload @classmethod def with_backend( - cls, backend: Literal["pyarrow"], / + cls, backend_name: Literal["pyarrow"], / ) -> Loader[pa.Table, pa.Table]: ... @classmethod - def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: + def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. Parameters ---------- - backend + backend_name DataFrame package/config used to return data. * *polars*: Using `polars defaults`_ @@ -128,7 +128,7 @@ def with_backend(cls, backend: _Backend, /) -> Loader[Any, Any]: dtype: object """ obj = Loader.__new__(Loader) - obj._reader = get_backend(backend) + obj._reader = backend(backend_name) return obj # TODO: docs (examples) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index fe8f8212f..9645d0bb2 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -74,7 +74,7 @@ _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] -__all__ = ["get_backend"] +__all__ = ["backend"] _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" @@ -428,33 +428,35 @@ def is_ext_read(suffix: Any) -> TypeIs[Extension]: @overload -def get_backend(backend: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... +def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload -def get_backend(backend: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... 
+def backend(name: _PandasAny, /) -> _Reader[pd.DataFrame, pd.DataFrame]: ... @overload -def get_backend(backend: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... - - -def get_backend(backend: _Backend, /) -> _Reader[Any, Any]: - if backend == "polars": - return _PolarsReader(backend) - elif backend == "polars[pyarrow]": - return _PolarsPyArrowReader(backend) - elif backend == "pandas[pyarrow]": - return _PandasPyArrowReader(backend) - elif backend == "pandas": - return _PandasReader(backend) - elif backend == "pyarrow": - return _PyArrowReader(backend) - elif backend in {"ibis", "cudf", "dask", "modin"}: +def backend(name: _PyArrow, /) -> _Reader[pa.Table, pa.Table]: ... + + +def backend(name: _Backend, /) -> _Reader[Any, Any]: + """Reader initialization dispatcher.""" + if name == "polars": + return _PolarsReader(name) + elif name == "polars[pyarrow]": + return _PolarsPyArrowReader(name) + elif name == "pandas[pyarrow]": + return _PandasPyArrowReader(name) + elif name == "pandas": + return _PandasReader(name) + elif name == "pyarrow": + return _PyArrowReader(name) + elif name in {"ibis", "cudf", "dask", "modin"}: msg = "Supported by ``narwhals``, not investigated yet" raise NotImplementedError(msg) else: - raise TypeError(backend) + msg = f"Unknown backend {name!r}" + raise TypeError(msg) @overload From 3fbc759233fdf0203a2f8685245152732f57276a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 00:04:49 +0000 Subject: [PATCH 084/137] fix(DRAFT): Add multiple fallbacks for `pyarrow` JSON --- altair/datasets/_readers.py | 62 ++++++++++++++++++++++++++++++++----- tests/test_datasets.py | 40 +++++++++++++++++++++++- 2 files changed, 94 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 9645d0bb2..0f30e58b9 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -11,6 +11,7 @@ import os import urllib.request +from collections.abc import Mapping, Sequence from functools import partial from http.client import HTTPResponse from importlib import import_module @@ -34,6 +35,7 @@ from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT if TYPE_CHECKING: + import json # noqa: F401 import sys from urllib.request import OpenerDirector @@ -346,25 +348,71 @@ class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): def __init__(self, name: _PyArrow, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: - pa = self._import(self._name) # noqa: F841 + pa = self._import(self._name) pa_csv = self._import(f"{self._name}.csv") pa_feather = self._import(f"{self._name}.feather") - pa_json = self._import(f"{self._name}.json") pa_parquet = self._import(f"{self._name}.parquet") - pa_read_csv = pa_csv.read_csv pa_read_feather = pa_feather.read_table - pa_read_json = pa_json.read_json pa_read_parquet = pa_parquet.read_table - # opt1 = ParseOptions(delimiter="\t") # type: ignore + # HACK: Multiple alternatives to `pyarrow.json.read_json` + # ------------------------------------------------------- + # NOTE: Prefer `polars` since it is zero-copy and fast (1) + if find_spec("polars") is not None: + import polars as pl + + def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: + return pl.read_json(source).to_arrow() + + else: + import json + + def stdlib_read_json(source: Any, /, **kwds) -> pa.Table: + if not isinstance(source, (Path)): + obj = json.load(source) + else: + with Path(source).open(encoding="utf-8") as f: + obj = json.load(f) + # Very naive check, 
but still less likely to fail + if isinstance(obj, Sequence) and isinstance(obj[0], Mapping): + return pa.Table.from_pylist(obj) + else: + # NOTE: Almost certainly will fail on read as of `v2.9.0` + pa_json = self._import(f"{self._name}.json") + return pa_json.read_json(source) + + # NOTE: Use `pandas` as a slower fallback (2) + if find_spec("pandas") is not None: + import pandas as pd + + def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: + try: + table = ( + nw.from_native( + pd.read_json( + source, dtype_backend="pyarrow" + ).convert_dtypes(dtype_backend="pyarrow") + ) + .with_columns( + nw.selectors.by_dtype(nw.Object).cast(nw.String) + ) + .to_arrow() + ) + except ValueError: + table = stdlib_read_json(source) + return table + else: + # NOTE: Convert inline from stdlib json (3) + pa_read_json = stdlib_read_json + # Stubs suggest using a dataclass, but no way to construct it - opt2: Any = {"delimiter": "\t"} + tab_sep: Any = {"delimiter": "\t"} self._read_fn = { ".csv": pa_read_csv, ".json": pa_read_json, - ".tsv": partial(pa_read_csv, parse_options=opt2), + ".tsv": partial(pa_read_csv, parse_options=tab_sep), ".arrow": pa_read_feather, } self._scan_fn = {".parquet": pa_read_parquet} diff --git a/tests/test_datasets.py b/tests/test_datasets.py index b3cd1ab8c..e39497fb4 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -17,7 +17,8 @@ from pathlib import Path from typing import Literal - from altair.datasets._readers import _Backend + from altair.datasets._readers import _Backend, _Pandas, _Polars + from altair.datasets._typing import DatasetName CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -274,3 +275,40 @@ def test_reader_cache( assert len(tuple(cache_dir.iterdir())) == 4 assert cached_paths == tuple(cache_dir.iterdir()) + + +@pytest.mark.parametrize( + "dataset", + [ + "cars", + "movies", + "wheat", + "barley", + "gapminder", + "income", + "burtin", + pytest.param( + "earthquakes", + marks=pytest.mark.xfail( + reason="GeoJSON seems to not work with pandas -> pyarrow" + ), + ), + ], +) +@pytest.mark.parametrize("fallback", ["polars", "pandas", None]) +@skip_requires_pyarrow +def test_pyarrow_read_json( + fallback: _Polars | _Pandas | None, + dataset: DatasetName, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv(CACHE_ENV_VAR, "") + + if fallback == "polars" or fallback is None: + monkeypatch.delitem(sys.modules, "pandas", raising=False) + elif fallback == "pandas" or fallback is None: + monkeypatch.setitem(sys.modules, "polars", None) + + data = Loader.with_backend("pyarrow") + + data(dataset, ".json") From 4f5b4de6d894a1297bd2edfaecb72c5eefa48bc7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:56:07 +0000 Subject: [PATCH 085/137] test: Remove `pandas` fallback for `pyarrow` There are enough alternatives here, it only added complexity --- altair/datasets/_readers.py | 40 ++++++++++---------------------- tests/test_datasets.py | 46 ++++++++++++++++++++++--------------- 2 files changed, 40 insertions(+), 46 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 0f30e58b9..2e20fd375 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -366,46 +366,30 @@ def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: return pl.read_json(source).to_arrow() else: + # NOTE: Convert inline from stdlib json (2) import json - def stdlib_read_json(source: Any, /, **kwds) -> pa.Table: - if not 
isinstance(source, (Path)): + pa_json = self._import(f"{self._name}.json") + + def pa_read_json(source: Any, /, **kwds) -> pa.Table: + if not isinstance(source, Path): obj = json.load(source) else: with Path(source).open(encoding="utf-8") as f: obj = json.load(f) - # Very naive check, but still less likely to fail + # NOTE: Common case of {"values": [{...}]}, missing the `"values"` keys if isinstance(obj, Sequence) and isinstance(obj[0], Mapping): return pa.Table.from_pylist(obj) + elif isinstance(obj, Mapping) and "type" in obj: + msg = ( + "Inferred file as geojson, unsupported by pyarrow.\n" + "Try installing `polars` or using `Loader.url(...)` instead." + ) + raise NotImplementedError(msg) else: # NOTE: Almost certainly will fail on read as of `v2.9.0` - pa_json = self._import(f"{self._name}.json") return pa_json.read_json(source) - # NOTE: Use `pandas` as a slower fallback (2) - if find_spec("pandas") is not None: - import pandas as pd - - def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: - try: - table = ( - nw.from_native( - pd.read_json( - source, dtype_backend="pyarrow" - ).convert_dtypes(dtype_backend="pyarrow") - ) - .with_columns( - nw.selectors.by_dtype(nw.Object).cast(nw.String) - ) - .to_arrow() - ) - except ValueError: - table = stdlib_read_json(source) - return table - else: - # NOTE: Convert inline from stdlib json (3) - pa_read_json = stdlib_read_json - # Stubs suggest using a dataclass, but no way to construct it tab_sep: Any = {"delimiter": "\t"} diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e39497fb4..01167cf10 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -11,21 +11,23 @@ import altair as alt # noqa: F401 from altair.datasets import Loader -from tests import skip_requires_pyarrow +from altair.datasets._typing import DatasetName if TYPE_CHECKING: from pathlib import Path from typing import Literal - from altair.datasets._readers import _Backend, _Pandas, _Polars - from altair.datasets._typing import DatasetName + import polars as pl + from _pytest.mark.structures import ParameterSet + + from altair.datasets._readers import _Backend, _Polars CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" -requires_pyarrow = skip_requires_pyarrow() +requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() -backends = pytest.mark.parametrize( +backends: pytest.MarkDecorator = pytest.mark.parametrize( "backend", [ "polars", @@ -277,36 +279,44 @@ def test_reader_cache( assert cached_paths == tuple(cache_dir.iterdir()) +movies_fail: ParameterSet = pytest.param( + "movies", + marks=pytest.mark.xfail( + reason="Only working for `polars`.\n" + "`pyarrow` isn't happy with the mixed `int`/`str` column." 
+ ), +) +earthquakes_fail: ParameterSet = pytest.param( + "earthquakes", + marks=pytest.mark.xfail( + reason="Only working for `polars`.\n" "GeoJSON fails on native `pyarrow`" + ), +) + + @pytest.mark.parametrize( "dataset", [ "cars", - "movies", + movies_fail, "wheat", "barley", "gapminder", "income", "burtin", - pytest.param( - "earthquakes", - marks=pytest.mark.xfail( - reason="GeoJSON seems to not work with pandas -> pyarrow" - ), - ), + earthquakes_fail, ], ) -@pytest.mark.parametrize("fallback", ["polars", "pandas", None]) +@pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | _Pandas | None, + fallback: _Polars | None, dataset: DatasetName, monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") - - if fallback == "polars" or fallback is None: - monkeypatch.delitem(sys.modules, "pandas", raising=False) - elif fallback == "pandas" or fallback is None: + monkeypatch.delitem(sys.modules, "pandas", raising=False) + if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) data = Loader.with_backend("pyarrow") From 69a72b6e32625687223987d04e3c3f925421c1ab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 13:59:05 +0000 Subject: [PATCH 086/137] test: Adds `test_all_datasets` Disabled by default, since there are 74 datasets --- pyproject.toml | 8 ++++++-- tests/test_datasets.py | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4132f0a25..2297ca2ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -420,10 +420,14 @@ docstring-code-line-length = 88 # They contain examples which are being executed by the # test_examples tests. norecursedirs = ["tests/examples_arguments_syntax", "tests/examples_methods_syntax"] -addopts = ["--numprocesses=logical"] +addopts = [ + "--numprocesses=logical", + "-m not datasets_debug" +] # https://docs.pytest.org/en/stable/how-to/mark.html#registering-marks markers = [ - "slow: Label tests as slow (deselect with '-m \"not slow\"')" + "slow: Label tests as slow (deselect with '-m \"not slow\"')", + "datasets_debug: Disabled by default due to high number of requests" ] [tool.mypy] diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 01167cf10..d3f7625cd 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,15 +3,15 @@ import re import sys from importlib.util import find_spec -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, cast, get_args import pytest from narwhals.dependencies import is_into_dataframe, is_polars_dataframe from narwhals.stable import v1 as nw -import altair as alt # noqa: F401 from altair.datasets import Loader from altair.datasets._typing import DatasetName +from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: from pathlib import Path @@ -47,6 +47,27 @@ ], ) +datasets_debug: pytest.MarkDecorator = slow(pytest.mark.datasets_debug) +""" +Custom ``pytest.mark`` decorator. + +Use for more exhaustive tests that require many requests. + +**Disabled** by default in ``pyproject.toml``: + + [tool.pytest.ini_options] + addopts = ... 
+""" + + +@pytest.fixture(scope="session") +def polars_loader( + tmp_path_factory: pytest.TempPathFactory, +) -> Loader[pl.DataFrame, pl.LazyFrame]: + data = Loader.with_backend("polars") + data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") + return data + @backends def test_loader_with_backend(backend: _Backend) -> None: From 08101cc33aa1d08f25323ea1de161c6863f30ceb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 14:07:42 +0000 Subject: [PATCH 087/137] refactor: Remove `_Reader._response` Can't reproduce the original issue that led to adding this. All backends support `HTTPResponse` directly. --- altair/datasets/_readers.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 2e20fd375..65df737e8 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -13,7 +13,6 @@ import urllib.request from collections.abc import Mapping, Sequence from functools import partial -from http.client import HTTPResponse from importlib import import_module from importlib.util import find_spec from itertools import chain, islice @@ -81,10 +80,6 @@ _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" -def _identity(_: _T, /) -> _T: - return _ - - class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ Describes basic IO for remote & local tabular resources. @@ -123,17 +118,6 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() - _response: ClassVar[staticmethod[[HTTPResponse], Any]] = staticmethod(_identity) - """ - Backends that do not support `file-like objects`_, must override with conversion. - - Used only for **remote** files, as *cached* files use a `pathlib.Path`_. - - .. _file-like objects: - https://docs.python.org/3/glossary.html#term-file-object - ..
_pathlib.Path: - https://docs.python.org/3/library/pathlib.html#pathlib.Path - """ def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: suffix = validate_suffix(source, is_ext_read) @@ -168,7 +152,7 @@ def dataset( return fn(fp, **kwds) else: with self._opener.open(url) as f: - return fn(self._response(f), **kwds) + return fn(f, **kwds) def url( self, @@ -295,8 +279,6 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _response = staticmethod(HTTPResponse.read) - def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -311,8 +293,6 @@ def __init__(self, name: _Polars, /) -> None: class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - _response = staticmethod(HTTPResponse.read) - def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: _pl, _pa = _requirements(name) self._name = name From 90428a625bc3928684018d57861f608574812fd8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:49:33 +0000 Subject: [PATCH 088/137] fix: Correctly handle no remote connection Previously, `Path.touch()` appeared to be a cache-hit - despite being an empty file. - Fixes that bug - Adds tests --- altair/datasets/_readers.py | 4 ++-- tests/test_datasets.py | 47 ++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 65df737e8..57b290c32 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -143,11 +143,11 @@ def dataset( if cache := self._cache: fp = cache / (result["sha"] + result["suffix"]) - if fp.exists(): + if fp.exists() and fp.stat().st_size: return fn(fp, **kwds) else: - fp.touch() with self._opener.open(url) as f: + fp.touch() fp.write_bytes(f.read()) return fn(fp, **kwds) else: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index d3f7625cd..1b866cf58 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -2,8 +2,10 @@ import re import sys +from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, cast, get_args +from typing import TYPE_CHECKING, Any, cast, get_args +from urllib.error import URLError import pytest from narwhals.dependencies import is_into_dataframe, is_polars_dataframe @@ -353,3 +355,46 @@ def test_all_datasets( """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name) assert is_polars_dataframe(frame) + + +def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): + raise e(*args, **kwds) + + +def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + from polars.testing import assert_frame_equal + + data = Loader.with_backend("polars") + data.cache_dir = tmp_path + + data("londonCentroids") + data("stocks") + data("driving") + + cached_paths = tuple(tmp_path.iterdir()) + assert len(cached_paths) == 3 + + raiser = partial(_raise_exception, URLError) + with monkeypatch.context() as mp: + mp.setattr(data._reader._opener, "open", raiser) + # Existing cache entries don't trigger an error + data("londonCentroids") + data("stocks") + data("driving") + # Mocking cache-miss without remote conn + with pytest.raises(URLError): + data("birdstrikes") + assert len(tuple(tmp_path.iterdir())) == 3 + + # Now we can get a cache-hit + frame = data("birdstrikes") + assert 
is_polars_dataframe(frame) + assert len(tuple(tmp_path.iterdir())) == 4 + + with monkeypatch.context() as mp: + mp.setattr(data._reader._opener, "open", raiser) + # Here, the remote conn isn't considered - we already have the file + frame_from_cache = data("birdstrikes") + assert len(tuple(tmp_path.iterdir())) == 4 + + assert_frame_equal(frame, frame_from_cache) From 8ad78c174933c9b728f30db653354da6aff64f23 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:26:41 +0000 Subject: [PATCH 089/137] docs: Align `_typing.Metadata` and `Loader.(url|__call__)` descriptions Related https://github.com/vega/altair/commit/c572180ebc7d876714a38688c53f7e4af87abd93 --- altair/datasets/__init__.py | 8 ++++---- altair/datasets/_typing.py | 24 +++++++++++++++--------- tools/datasets/__init__.py | 28 +++++++++++++++++++--------- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index b6f983754..d6acbf4c2 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -146,7 +146,7 @@ def __call__( Parameters ---------- name - Name of the dataset/`stem`_ of file name. + Name of the dataset/`Path.stem`_. suffix File extension/`Path.suffix`_. @@ -157,7 +157,7 @@ def __call__( **kwds Arguments passed to the underlying read function. - .. _stem: + .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix @@ -179,7 +179,7 @@ def url( Parameters ---------- name - Name of the dataset/`stem`_ of file name. + Name of the dataset/`Path.stem`_. suffix File extension/`Path.suffix`_. @@ -188,7 +188,7 @@ def url( tag Version identifier for a `vega-datasets release`_. - .. _stem: + .. _Path.stem: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem .. _Path.suffix: https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 0a86bc6ba..ed9ca99a6 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -149,16 +149,16 @@ class Metadata(TypedDict, total=False): Parameters ---------- dataset_name - Equivalent to ``Pathlib.Path.stem``. + Name of the dataset/`Path.stem`_. ext_supported Dataset can be read as tabular data. file_name - Equivalent to ``Pathlib.Path.name``. + Equivalent to `Path.name`_. name_collision - Dataset is available via multiple ``suffix``(s). + Dataset is available via multiple formats. .. note:: - Requires specifying a preference in calls to ``data(ext=...)``. + Requires specifying a preference in calls to ``data(name, suffix=...)`` sha Unique hash for the dataset. @@ -169,14 +169,20 @@ class Metadata(TypedDict, total=False): size File size (*bytes*). suffix - File extension. - - .. note:: - Equivalent to ``Pathlib.Path.suffix`` + File extension/`Path.suffix`_. tag - ``vega-datasets`` release version. + Version identifier for a `vega-datasets release`_. url_npm Remote url used to access dataset. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.name: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases """ dataset_name: str diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index f318f292e..5e2ca1dd7 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -158,28 +158,38 @@ def generate_typing(self, output: Path, /) -> None: NOTE_SEP = f"\n\n{indent * 2}" f".. note::\n{indent * 3}" name_collision = ( - f"Dataset is available via multiple ``suffix``(s).{NOTE_SEP}" - "Requires specifying a preference in calls to ``data(ext=...)``." + f"Dataset is available via multiple formats.{NOTE_SEP}" + "Requires specifying a preference in calls to ``data(name, suffix=...)``" ) sha = ( f"Unique hash for the dataset.{NOTE_SEP}" f"If the dataset did *not* change between ``v1.0.0``-``v2.0.0``;\n\n{indent * 3}" f"then all ``tag``(s) in this range would **share** this value." ) + links = ( + f".. _Path.stem:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem\n" + f".. _Path.name:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.name\n" + f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" + f".. _vega-datasets release:\n{indent * 2}https://github.com/vega/vega-datasets/releases" + ) + descriptions: dict[str, str] = { - "dataset_name": "Equivalent to ``Pathlib.Path.stem``.", + "dataset_name": "Name of the dataset/`Path.stem`_.", "ext_supported": "Dataset can be read as tabular data.", - "file_name": "Equivalent to ``Pathlib.Path.name``.", + "file_name": "Equivalent to `Path.name`_.", "name_collision": name_collision, "sha": sha, "size": "File size (*bytes*).", - "suffix": f"File extension.{NOTE_SEP}Equivalent to ``Pathlib.Path.suffix``", - "tag": "``vega-datasets`` release version.", + "suffix": "File extension/`Path.suffix`_.", + "tag": "Version identifier for a `vega-datasets release`_.", "url_npm": "Remote url used to access dataset.", } - metadata_doc = f"\n{indent}".join( - f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" - for param in metadata_schema + metadata_doc = ( + f"\n{indent}".join( + f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" + for param in metadata_schema + ) + + f"\n\n{links}" ) contents = ( From e6504546f89831930168e6bcaa7150f690ef4709 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:14:32 +0000 Subject: [PATCH 090/137] feat: Update to `v2.10.0`, fix tag inconsistency - Noticed one branch that missed the join to `npm` - Moved the join to `.tags()` and added a doc - https://github.com/vega/vega-datasets/releases/tag/v2.10.0 --- altair/datasets/_metadata/metadata.parquet | Bin 18641 -> 19128 bytes altair/datasets/_typing.py | 1 + tools/datasets/_metadata/tags.parquet | Bin 6200 -> 6247 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2596 -> 2597 bytes tools/datasets/github.py | 43 ++++++++++++++------- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 5e7b3bd06439ace1d5eb8387efecde322cca91d7..969f64b18f44b812f11e0e1f34a58c6b592c994a 100644 GIT binary patch delta 7562 zcmb7J1z1#1zdpNkFR=?t=Mo|zNP{%ef`BLu0*Z7fQfEOLq*IWRk`NF@8l+1>KtPag zr9)EoBL3_9?!Di0@7a0I&Y5TC-PzwebDsB|-=|nebt0tP7D~{@2_Glsg`e@pJ40Du zbNCEMKo5ll0Kge91n*O2HdGr520*eYe9Q&EL>g31>ZmZ^Q07uh!Yg4hGK%9TjVotu 
[... base85-encoded GIT binary patch data (binary file delta, not human-readable) ...]
zrd@6vh~KxTXC&%Mj-Ic7yR7A+w5T>>jF;V3WS_C7Eq0~}W(bose95CJrsYE|pMUlq zvL4e>EPPH*JeG|07Vte5uSoKu%?N66i_&yhFC}<&EgIs_&UkKJ*XWd>)xRb8c%(~C zO<@$jcZ8us5mb5=Nm4Wd>uZ~WqhyXECzYRGp*IFEILdI$@v&H!T~4q_{O%P=d7T+r zEZ0eDh3zGUSBx5#VTem-iY(fcCsh9x=oGQ_^(%wTWEkGNMV%#nt3KnaDG6Sh^tx1SudBfRNudJ>?)?K!{)};TQ;4 zgOp;X*c0m|M_<6wZ8)GZ?j>;zT0fCs*F-Rgi#}T=ZJXnp*s@T%*6x(z-83Tq;BQ1V zNVr6a(mYRmS&JmL#hJ;;f^c*?2Si|Of#B6jdVta@kO08bstDzUZ{|-`@e+hPK*}TW zb=+dr?|ZiSDzTe3Y&F#u=&5-CfU;a)%5hY406_dGKrK|@>n-pN^yP*5PB9$-;Zl%t z)WZ5}Iz~Q$vmvIu6A(w^b4J z5d`Fyd^Jcg0FcOVz7iw_<)I>$X+am1zQrY`&8H8B~rdKQV2gS9QW~WVLH=)D3yBI#?I9u@XAM~ZlvH>c z@Nb<>9hNEJ)z;7@5TMA+#5_$M3rUJ_7Du` zvmqOR@=jc+rLD?=r%7UlFv~$s*fsTCx@a2Te~4hAwK-slM#|?xaS+j8pNczNs^G}A zQ;c!T@+Zgt5`pQre6hQ>x~tsHVfg-3e(_@R|5bpZ5D=8Fy?p2{MQWBkWB|aDXqZDm M!!lNdR9l4j6FMyk+5i9m delta 2375 zcmai#eK=Hk8^?dgjA<~&4D&n9Ov50?+jtqIL}kVnN@$y?cE#98Wm2eQWf1bTy>Zm4 zBy37l7l|Z^G&Z!7x1CUumn>EmZLXqrSheh_JdA-8b@sKdoHTHgRM`G57c+GudH6e<)SrOSyB%VZ|h%-#kWJ_ zuLi9mr`C4F*MIx+$`?`Z_848|H-Fc**sgthNDVJCg2Rv*OL}V$vCiqn1!OppnRi;% zdTLo5;kLx)hZ&3REAz+1}+`{kk-*TT!;X^k5bCpIh--#OR2VPD}=;n%rg@zRu`2BlYz zUyg1)V?BTU!_)b;*1AzY4kA9WDcS4nR=jOXA8zeEylM3XGT+VRy*N#;%z$?PeSpr} zWu33HGK)MlNCv(?oDQWNVOmDkWcvSV0VA#j-lRJI;zIF$Dv*S4?~HUfJw{x>R&~3G ziH`TRG-mJ0(PD;|)y4mZb?s-PArZcP#O%ho^unYUPhvuQ<4Zz|J8u{2I#++y5M4_5 zKGG%JR4aHv(K=IW6|k#PN(%^ZbbDYwmG?*?*lX%h(INI>%jcEcetmCSY3b^mHmOfZnX8_1AI&l!}zf0_N;`3B?a4Q~!#qKvOk{?#S;w-TnS zbN}#)4iA09_K&h~j+@gcX(w6`l{2n?-O-k_PR3G_#Jo004IWr<>49=YT)cVuB=JsV za;tvhODvA)ff!+f#55!cJEF8mfSBe`hbY!@j8%b0c@Nd{=QkqSlWG7T?%Bke>{c^T z&{d!)97MFWqp1sqiH0WgKq3_YpA`8|G7B~~f)~$_t@(GdLjy%KWS6kfd@3gwOIP%k zObFnx+FZKJ)WmFXc4s6RMw<*&;A^A5NUBSuCO7K2s9Wm3wiyQ z|4GB+B|;aP4JS-Fo4tY*naZ;FtbkL%`=DrS^GB+1O3&G3>@j#AZYyr%@(^atwOtg80oMe*s5nC78MU*6Tn1rpyNZi3X$cUghm+b1!{ zL_98?Q1!ulEnQ2=gLsbk7nxehT{C3jr^&>U)KMiNn&X7#(>N7azIr`IBN-q<%${Vr zP>3LUhy{T7*}1GJ7VQWQnc+7JK}!%$tvPm-v_-BSB$+N3B8sX%rV5BUaTGxU1?kv% zt>~1S(Za~<1=;GB@zd|-7;WD*YgCFya}_v4JZxBlfd#8_vpv|dbwWDlq5>kUxk#2L*t^LQgBeVniNJ$LSidOkp;D zWB@PXXX8Zq{GAQ^EMfj9`NKu<!m<9}a%C8zG6AHcS$1#<;IzRx!vCFBo7K>fX>pq!EJpxm!~{F3KL2m{JT^&m zlnYXq!Am&6iRH0AU-v)5!%#&t93hI9JHX|Da~f;VpeYN{IZ!Z-il+@HXTHm9=J@>+ z1`Dj}r`4tWX}94ef2sea+4J4|rmL8k$*Wz;`qvKI+W2E{+PTUizQ>VQurU;O40#iLRxjtZHVGc{Cig!%c2 zxZF>wUXyor({D{t2GIi1CPp1m#;S>%wSdGu1Ie1_wW2(drVMNXk`l}%iRq#|VpU@A zc*N#RzQ`&$*@01=v1)P#qpnI7BclnU8-pkdR4r?6Vpe_;SoI=Cu^o(RO{_MP^BE;3 zUu5(EYLQ~n^{irK737?o?4`MMN)VT%GYc>v7%J{pzpr`S^M3F5xM-YVuZQ#fU1gCML04jA~1G zZOqa%IM_gr0s;vR2n`~E!K5T=q63UNpa~!|7&I6eB^W_cpomlu>||hI0&=Pt*?}Y{ z*ldQ${OpqKRbmlhS!!P<`>`xyOq=|XMbzdBljsFTF`zO@8L>tSK&xA#l7{t~{h*ha^h#lGN I&wiB&05aa^ng9R* delta 884 zcmZuuZAcVB7@pa^-8;`a+tr(CH#fSZ*2}u{lv4~2QG`r0x~?eW`k``SN~u`LrbQwW zB#K>gNTfv^St*r2xi4>Y;5*&U&5csD+!*=Zy2?yq#XXbt1hj*T*$MVEt z_2hcfkPfXG(F>{wDF^^ydz1R7WC)h)K-m}{uv%`h&-#56xdn5>uljvO_^PKaWw5$n zwQKjZeIe4&{&{+?Y`rfvGk>hpHU1%TwwD7O<$lhMd~xX?BYCg2#n2WwXKbZ8SIPNa z=f{rg%k3w=bGe7xd@nD&3p&jAq`Tp&j9~huaz2>~)tolxt=Dx?2%uWj438is5Ke(g zVBRTST)c@)Vg_J2QJ}AtRv;5r@QevRAU2o-6;cW92NVd?;W2=8(K_ZzX;n?#Mr8=% z0Vp>yZc+;cG6j#TEs2ypUx5t2B%LWd)D!fM3rVU1U}W`MKP%u@pK z(oUm-IK(1W%Jh>q`XG~oe#0#Cmfl~i;ARTXLOEo1XZmdhR+~p%g5lq(MOB+58g6;To|%ORb#vsXIV;8y2l>(d=a z9Zo@CpXNw&I-CID+ZH6?Fpsi;JitoZl-}9F6#U47F#f@#Y#=YhY;V}XnmuM5*7*D4 zwekVd#9CsuY#|TX#JCA9y|j^Di<8JOEpY!^K}uMQDWcUqP0(n*k~n>0a#Yh@+P9)M Y;5BWuNCHX%xKqH2Y{2hdd-R{EzgUUmzyJUM diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 6bde876ae..3e57cd469 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -326,10 +326,33 @@ def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> 
float: return self.req.delay(is_auth=limit["is_auth"]) def tags( - self, n_head: int | None = None, *, warn_lower: bool = False + self, + n_head: int | None = None, + *, + npm_tags: pl.DataFrame | pl.LazyFrame | None = None, + warn_lower: bool = False, ) -> pl.DataFrame: + """ + Get release info, enhance with `SemVer`_ context. + + Parameters + ---------- + n_head + Limit to most recent releases. + npm_tags + Used to remove any github-only releases. + warn_lower + Emit a warning if fewer than ``n_head`` tags were returned. + + .. _SemVer: + https://semver.org/#semantic-versioning-200 + """ tags = self.req.tags(n_head or self.req._TAGS_MAX_PAGE, warn_lower=warn_lower) - return pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + frame = pl.DataFrame(self.parse.tags(tags)).pipe(semver.with_columns) + if npm_tags is not None: + return frame.lazy().join(npm_tags.lazy().select("tag"), on="tag").collect() + else: + return frame def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: """Retrieve directory info for a given version ``tag``.""" @@ -394,29 +417,23 @@ def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: npm_tag_only = npm_tags.lazy().select("tag") fp = self._paths["tags"] if not limit["is_auth"] and limit["remaining"] <= self.req._TAGS_COST: - return ( - pl.scan_parquet(fp).join(npm_tag_only, on="tag", how="inner").collect() - ) + return pl.scan_parquet(fp).join(npm_tag_only, on="tag").collect() elif not fp.exists(): print(f"Initializing {fp!s}") - tags = ( - self.tags().lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) + tags = self.tags(npm_tags=npm_tag_only) print(f"Collected {tags.height} new tags") return tags else: print("Checking for new tags") prev = pl.scan_parquet(fp) - latest = ( - self.tags(1).lazy().join(npm_tag_only, on="tag", how="inner").collect() - ) + latest = self.tags(1, npm_tags=npm_tag_only) if latest.equals(prev.pipe(semver.sort).head(1).collect()): print(f"Already up-to-date {fp!s}") return prev.collect() print(f"Refreshing {fp!s}") prev_eager = prev.collect() tags = ( - pl.concat((self.tags(), prev_eager), how="vertical") + pl.concat((self.tags(npm_tags=npm_tag_only), prev_eager)) .unique("sha") .pipe(semver.sort) ) @@ -434,7 +451,7 @@ def _trees_batched(self, tags: Iterable[str | ParsedTag], /) -> pl.DataFrame: raise NotImplementedError(rate_limit, cost) print( f"Collecting metadata for {n} missing releases.\n" - f"Using {self.delay(rate_limit)}[ms] between requests ..." + f"Using {self.delay(rate_limit):.2f}[ms] between requests ..." ) dfs: list[pl.DataFrame] = [] for tag in tags: From 72296b0e630dad0d2d7c397c6e4887d74c537846 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:25:13 +0000 Subject: [PATCH 091/137] refactor: Tidying up `tools.datasets` --- tools/datasets/github.py | 31 ++++++++++++++----------------- tools/datasets/semver.py | 19 ++++++++----------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 3e57cd469..385ac1079 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -13,6 +13,7 @@ from typing import IO, TYPE_CHECKING, Any, ClassVar, Literal, TypeVar, cast import polars as pl +from polars import col from tools.datasets import semver from tools.datasets.models import ( @@ -171,9 +172,9 @@ def _request(self, url: str, /, *, raw: bool = False) -> Request: See `Media types`_. .. 
_personal access token: - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens + https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens .. _Media types: - https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types + https://docs.github.com/en/rest/using-the-rest-api/getting-started-with-the-rest-api?apiVersion=2022-11-28#media-types """ headers: MutableMapping[str, str] = {"X-GitHub-Api-Version": self._VERSION} if tok := os.environ.get(self._ENV_VAR): @@ -267,7 +268,6 @@ class GitHub: https://docs.github.com/en/rest/git/trees?apiVersion=2022-11-28#get-a-tree .. _rate_limit: https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user - """ _opener: ClassVar[OpenerDirector] = urllib.request.build_opener(_ErrorHandler) @@ -359,17 +359,16 @@ def trees(self, tag: str | ParsedTag, /) -> pl.DataFrame: trees = self.req.trees(tag) tag_v = self.parse.tag_from_str(tag) if _is_str(tag) else tag["tag"] parsed = self.parse.trees(trees, tag=tag_v) + url = pl.concat_str( + pl.lit(self._npm_cdn_url), + col("tag"), + pl.lit(f"/{_DATA}/"), + col("file_name"), + ) df = ( - pl.DataFrame(parsed) - .lazy() - .with_columns(name_collision=pl.col("dataset_name").is_duplicated()) + pl.LazyFrame(parsed) .with_columns( - url_npm=pl.concat_str( - pl.lit(self._npm_cdn_url), - pl.col("tag"), - pl.lit(f"/{_DATA}/"), - pl.col("file_name"), - ) + name_collision=col("dataset_name").is_duplicated(), url_npm=url ) .collect() ) @@ -397,12 +396,10 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) else: trees = ( - pl.scan_parquet(fp) - .with_columns(pl.col("tag").cast(pl.String)) - .collect() + pl.scan_parquet(fp).with_columns(col("tag").cast(pl.String)).collect() ) missing_trees = gh_tags.join( - trees.select(pl.col("tag").unique()), on="tag", how="anti" + trees.select(col("tag").unique()), on="tag", how="anti" ) if missing_trees.is_empty(): print(f"Already up-to-date {fp!s}") @@ -410,7 +407,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) result = pl.concat((trees, fresh)) - return result.with_columns(pl.col("tag").cast(semver.tag_enum(gh_tags))) + return result.with_columns(col("tag").cast(semver.tag_enum(gh_tags))) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index 57f6d509f..f18e1e992 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Literal import polars as pl +from polars import col if TYPE_CHECKING: from typing import TypeVar @@ -24,14 +25,14 @@ CANARY: Literal["--canary"] = "--canary" -def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: +def with_columns(frame: _Frame, /, *, tag: str = "tag") -> _Frame: """ Extracts components of a `SemVer`_ string into sortable columns. .. _SemVer: https://semver.org/#backusnaur-form-grammar-for-valid-semver-versions """ - fields = pl.col(_SEM_VER_FIELDS) + fields = col(_SEM_VER_FIELDS) pattern = r"""(?x) v?(?[[:digit:]]*)\. (?[[:digit:]]*)\. @@ -39,12 +40,12 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: (\-(next)?(beta)?\.)? 
(?[[:digit:]]*)? """ - sem_ver = pl.col(col_tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) + sem_ver = col(tag).str.extract_groups(pattern).struct.field(*_SEM_VER_FIELDS) ldf = ( frame.lazy() .with_columns(sem_ver) .with_columns(pl.when(fields.str.len_chars() > 0).then(fields).cast(pl.Int64)) - .with_columns(is_pre_release=pl.col("pre_release").is_not_null()) + .with_columns(is_pre_release=col("pre_release").is_not_null()) ) if isinstance(frame, pl.DataFrame): return ldf.collect() @@ -52,14 +53,10 @@ def with_columns(frame: _Frame, /, *, col_tag: str = "tag") -> _Frame: return ldf -def tag_enum(frame: _Frame, /, *, col_tag: str = "tag") -> pl.Enum: - """Extract an **ascending** order ``pl.Enum`` from ``col_tag``.""" +def tag_enum(frame: _Frame, /, *, tag: str = "tag") -> pl.Enum: + """Extract an **ascending** order ``pl.Enum`` from ``tag``.""" return pl.Enum( - frame.lazy() - .pipe(sort, descending=False) - .select(col_tag) - .collect() - .get_column(col_tag) + frame.lazy().pipe(sort, descending=False).select(tag).collect().get_column(tag) ) From ca1b500c220a5ef7042bac75070d679696923cc8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:57:38 +0000 Subject: [PATCH 092/137] revert: Remove tags schema files --- tools/datasets/_metadata/tags-schema.json | 10 ---------- tools/datasets/_metadata/tags_npm-schema.json | 8 -------- 2 files changed, 18 deletions(-) delete mode 100644 tools/datasets/_metadata/tags-schema.json delete mode 100644 tools/datasets/_metadata/tags_npm-schema.json diff --git a/tools/datasets/_metadata/tags-schema.json b/tools/datasets/_metadata/tags-schema.json deleted file mode 100644 index 80f248a66..000000000 --- a/tools/datasets/_metadata/tags-schema.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "tag": "str", - "sha": "str", - "trees_url": "str", - "major": "int", - "minor": "int", - "patch": "int", - "pre_release": "int", - "is_pre_release": "bool" -} \ No newline at end of file diff --git a/tools/datasets/_metadata/tags_npm-schema.json b/tools/datasets/_metadata/tags_npm-schema.json deleted file mode 100644 index 90ea9d52e..000000000 --- a/tools/datasets/_metadata/tags_npm-schema.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "tag": "str", - "major": "int", - "minor": "int", - "patch": "int", - "pre_release": "int", - "is_pre_release": "bool" -} \ No newline at end of file From 5bd70d11bce05e75ffce42274ffe5307aaf5cf21 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:21:24 +0000 Subject: [PATCH 093/137] ci: Introduce `datasets` refresh to `generate_schema_wrapper` Unrelated to schema, but needs to hook in somewhere --- tools/datasets/__init__.py | 21 ++++++++++++++++++--- tools/generate_schema_wrapper.py | 3 +++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 5e2ca1dd7..b0730bd32 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -50,6 +50,7 @@ def __init__( self, out_dir_tools: Path, out_dir_altair: Path, + out_fp_typing: Path, *, write_schema: bool, trees_gh: str = "metadata", @@ -78,6 +79,7 @@ def __init__( "gh_trees": self.github._paths["trees"], } ) + self._fp_typing: Path = out_fp_typing @property def github(self) -> GitHub: @@ -87,8 +89,16 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh(self) -> pl.DataFrame: - """Update and sync all metadata files.""" + def refresh(self, *, 
include_typing: bool = False) -> pl.DataFrame: + """ + Update and sync all dataset metadata files. + + Parameters + ---------- + include_typing + Regenerate ``altair.datasets._typing``. + """ + print("Syncing datasets ...") npm_tags = self.npm.tags() self.write_parquet(npm_tags, self._paths["npm_tags"]) @@ -97,6 +107,9 @@ def refresh(self) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) + + if include_typing: + self.generate_typing(self._fp_typing) return gh_trees def reset(self) -> None: @@ -218,9 +231,11 @@ def generate_typing(self, output: Path, /) -> None: ruff.write_lint_format(output, contents) +_alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets" app = Application( Path(__file__).parent / "_metadata", - Path(__file__).parent.parent.parent / "altair" / "datasets" / "_metadata", + _alt_datasets / "_metadata", + _alt_datasets / "_typing.py", write_schema=False, ) diff --git a/tools/generate_schema_wrapper.py b/tools/generate_schema_wrapper.py index e024c2ca1..39b672082 100644 --- a/tools/generate_schema_wrapper.py +++ b/tools/generate_schema_wrapper.py @@ -1373,6 +1373,8 @@ def generate_encoding_artifacts( def main() -> None: + from tools import datasets + parser = argparse.ArgumentParser( prog="generate_schema_wrapper.py", description="Generate the Altair package." ) @@ -1387,6 +1389,7 @@ def main() -> None: output=EXPR_FILE, header=HEADER_COMMENT, ) + datasets.app.refresh(include_typing=True) # The modules below are imported after the generation of the new schema files # as these modules import Altair. This allows them to use the new changes From 012f98b9516ddb05dfb6888e802f3d0c894f206f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:34:28 +0000 Subject: [PATCH 094/137] docs: Add `tools.datasets.Application` doc --- tools/datasets/__init__.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index b0730bd32..f66c22795 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -42,8 +42,27 @@ class Application: """ Top-level context. - When ``write_schema``, addtional ``...-schema.json`` files are produced - that describes the metadata columns. + Parameters + ---------- + out_dir_tools, out_dir_altair + Directories to store ``.parquet`` metadata files. + out_fp_typing + Path to write metadata-derived typing module. + write_schema + Produce addtional ``...-schema.json`` files that describe table columns. + trees_gh + ``GitHub.trees`` metadata file name. + tags_gh + ``GitHub.tags`` metadata file name. + tags_npm + ``Npm.tags`` metadata file name. + kwds_gh, kwds_npm + Arguments passed to corresponding constructor. 
+ + See Also + -------- + - tools.datasets.github.GitHub + - tools.datasets.npm.Npm """ def __init__( From 5e677c05447e177a5bcd78086a2f080584b731e9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 12 Nov 2024 20:10:19 +0000 Subject: [PATCH 095/137] revert: Remove comment --- tests/utils/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 36ed1b097..2e8ae1214 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -137,7 +137,6 @@ def test_sanitize_pyarrow_table_columns() -> None: ) # Create pyarrow table with explicit schema so that date32 type is preserved - # error: Argument 1 to "schema" has incompatible type "list[object]"; expected "Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType]" [arg-type] pa_table = pa.Table.from_pandas( df, pa.schema( From a99d2c924786f3a2585f2f84bc4641002f9bafce Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:44:10 +0000 Subject: [PATCH 096/137] docs: Add a table preview to `Metadata` --- altair/datasets/_typing.py | 36 ++++++++++++++++++++++++++++++++ tools/datasets/__init__.py | 42 +++++++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 270ac9ab8..c13f847c0 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -184,6 +184,42 @@ class Metadata(TypedDict, total=False): https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix .. _vega-datasets release: https://github.com/vega/vega-datasets/releases + + Examples + -------- + ``Metadata`` keywords form constraints to filter a table like the below sample: + + ``` + shape: (2_879, 9) + ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ + │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ + │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ + │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ + │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ + ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ + │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ + │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ + │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ + │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ + │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ + │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ + │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ + │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ + └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + ``` """ dataset_name: str diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index f66c22795..44c766850 100644 --- a/tools/datasets/__init__.py +++ 
b/tools/datasets/__init__.py @@ -204,6 +204,45 @@ def generate_typing(self, output: Path, /) -> None: f".. _Path.suffix:\n{indent * 2}https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix\n" f".. _vega-datasets release:\n{indent * 2}https://github.com/vega/vega-datasets/releases" ) + import textwrap + + examples = f"""\ + Examples + -------- + ``{METADATA_TD}`` keywords form constraints to filter a table like the below sample: + + ``` + shape: (2_879, 9) + ┌───────────┬──────────┬──────────┬──────────┬───┬────────┬─────────┬──────────┐ + │ dataset_n ┆ ext_supp ┆ file_nam ┆ name_col ┆ … ┆ suffix ┆ tag ┆ url_npm │ + │ a… ┆ or… ┆ e ┆ li… ┆ ┆ --- ┆ --- ┆ --- │ + │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ str ┆ enum ┆ str │ + │ str ┆ bool ┆ str ┆ bool ┆ ┆ ┆ ┆ │ + ╞═══════════╪══════════╪══════════╪══════════╪═══╪════════╪═════════╪══════════╡ + │ cars ┆ true ┆ cars.jso ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ n ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ true ┆ … ┆ .arrow ┆ v1.31.1 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-2 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v2.9.0 ┆ https:// │ + │ 0… ┆ ┆ 20… ┆ ┆ ┆ ┆ ┆ cd… │ + │ unemploym ┆ true ┆ unemploy ┆ false ┆ … ┆ .json ┆ v2.7.0 ┆ https:// │ + │ e… ┆ ┆ me… ┆ ┆ ┆ ┆ ┆ cd… │ + │ ffox ┆ false ┆ ffox.png ┆ false ┆ … ┆ .png ┆ v2.5.2 ┆ https:// │ + │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ cd… │ + │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │ + │ flights-a ┆ true ┆ flights- ┆ false ┆ … ┆ .csv ┆ v1.18.0 ┆ https:// │ + │ i… ┆ ┆ ai… ┆ ┆ ┆ ┆ ┆ cd… │ + │ income ┆ true ┆ income.j ┆ false ┆ … ┆ .json ┆ v1.21.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ burtin ┆ true ┆ burtin.j ┆ false ┆ … ┆ .json ┆ v2.8.0 ┆ https:// │ + │ ┆ ┆ so… ┆ ┆ ┆ ┆ ┆ cd… │ + │ flights-5 ┆ true ┆ flights- ┆ false ┆ … ┆ .json ┆ v1.8.0 ┆ https:// │ + │ k ┆ ┆ 5k… ┆ ┆ ┆ ┆ ┆ cd… │ + │ wheat ┆ true ┆ wheat.js ┆ false ┆ … ┆ .json ┆ v1.18.0 ┆ https:// │ + │ ┆ ┆ on ┆ ┆ ┆ ┆ ┆ cd… │ + └───────────┴──────────┴──────────┴──────────┴───┴────────┴─────────┴──────────┘ + ``` + """ descriptions: dict[str, str] = { "dataset_name": "Name of the dataset/`Path.stem`_.", @@ -221,7 +260,8 @@ def generate_typing(self, output: Path, /) -> None: f"{param}\n{indent * 2}{descriptions.get(param, DESCRIPTION_DEFAULT)}" for param in metadata_schema ) - + f"\n\n{links}" + + f"\n\n{links}\n\n" + f"{textwrap.indent(textwrap.dedent(examples), indent)}" ) contents = ( From 7e6da39db8f9bbb691c5a734b2ed96e953fe35f4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 11:49:30 +0000 Subject: [PATCH 097/137] docs: Add examples for `Loader.__call__` --- altair/datasets/__init__.py | 88 ++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 7 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d6acbf4c2..d3a93cfa7 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -35,6 +35,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): from altair.datasets import Loader data = Loader.with_backend("polars") + >>> data # doctest: +SKIP Loader[polars] .. 
_vega-datasets: @@ -96,7 +97,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("polars") cars = data("cars") - type(cars) + >>> type(cars) # doctest: +SKIP polars.dataframe.frame.DataFrame Using ``pandas``: @@ -104,7 +105,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("pandas") cars = data("cars") - type(cars) + >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame Using ``pandas``, backed by ``pyarrow`` dtypes: @@ -112,10 +113,10 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: data = Loader.with_backend("pandas[pyarrow]") cars = data("cars", tag="v1.29.0") - type(cars) + >>> type(cars) # doctest: +SKIP pandas.core.frame.DataFrame - cars.dtypes + >>> cars.dtypes # doctest: +SKIP Name string[pyarrow] Miles_per_Gallon double[pyarrow] Cylinders int64[pyarrow] @@ -131,7 +132,6 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: obj._reader = backend(backend_name) return obj - # TODO: docs (examples) def __call__( self, name: DatasetName | LiteralString, @@ -163,6 +163,80 @@ def __call__( https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix .. _vega-datasets release: https://github.com/vega/vega-datasets/releases + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.with_backend("polars") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + shape: (560, 3) + ┌────────┬────────────┬────────┐ + │ symbol ┆ date ┆ price │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞════════╪════════════╪════════╡ + │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ + │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ + │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ + │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ + │ MSFT ┆ May 1 2000 ┆ 25.45 │ + │ … ┆ … ┆ … │ + │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ + │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ + │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ + │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ + │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ + └────────┴────────────┴────────┘ + + Using ``pandas``: + + data = Loader.with_backend("pandas") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + Index(['symbol', 'date', 'price'], dtype='object') + + >>> source # doctest: +SKIP + symbol date price + 0 MSFT Jan 1 2000 39.81 + 1 MSFT Feb 1 2000 36.35 + 2 MSFT Mar 1 2000 43.22 + 3 MSFT Apr 1 2000 28.37 + 4 MSFT May 1 2000 25.45 + .. ... ... ... 
+ 555 AAPL Nov 1 2009 199.91 + 556 AAPL Dec 1 2009 210.73 + 557 AAPL Jan 1 2010 192.06 + 558 AAPL Feb 1 2010 204.62 + 559 AAPL Mar 1 2010 223.02 + + [560 rows x 3 columns] + + Using ``pyarrow``: + + data = Loader.with_backend("pyarrow") + source = data("stocks", tag="v2.10.0") + + >>> source.column_names # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + pyarrow.Table + symbol: string + date: string + price: double + ---- + symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] + date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] + price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] """ return self._reader.dataset(name, suffix, tag=tag, **kwds) @@ -203,7 +277,7 @@ def url( from altair.datasets import Loader data = Loader.with_backend("polars") - data.url("cars", tag="v2.9.0") + >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' We can pass the result directly to a chart: @@ -231,7 +305,7 @@ def cache_dir(self) -> Path | None: data = Loader.with_backend("polars") data.cache_dir = Path.home() / ".altair_cache" - data.cache_dir.relative_to(Path.home()).as_posix() + >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP '.altair_cache' """ return self._reader._cache From b49e679e58729930513a54d13f039039bc9a0837 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:02:43 +0000 Subject: [PATCH 098/137] refactor: Rename `DatasetName` -> `Dataset`, `VersionTag` -> `Version` --- altair/datasets/__init__.py | 10 +++++----- altair/datasets/_readers.py | 15 ++++++--------- altair/datasets/_typing.py | 6 +++--- tests/test_datasets.py | 10 ++++------ tools/datasets/__init__.py | 4 ++-- 5 files changed, 20 insertions(+), 25 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d3a93cfa7..3760a4f2a 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -21,7 +21,7 @@ else: from typing_extensions import LiteralString from altair.datasets._readers import _Backend - from altair.datasets._typing import DatasetName, Extension, VersionTag + from altair.datasets._typing import Dataset, Extension, Version __all__ = ["Loader", "data"] @@ -134,10 +134,10 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: def __call__( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: """ @@ -242,10 +242,10 @@ def __call__( def url( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, ) -> str: """ Return the address of a remote dataset. 
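A minimal sketch of how the renamed aliases read in downstream code, assuming only the ``Loader`` API shown in the docstrings above; the dataset name, tag and URL are the same examples used there:

```
from __future__ import annotations

from altair.datasets import Loader
from altair.datasets._typing import Dataset, Version


def dataset_url(name: Dataset, tag: Version | None = None) -> str:
    # `Dataset` and `Version` are Literal aliases, so type checkers and IDEs
    # can flag a misspelled dataset name or an unknown release tag here.
    data = Loader.with_backend("polars")
    return data.url(name, tag=tag)


# dataset_url("cars", "v2.9.0")
# 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json'
```

The rename is annotation-only, so calls such as ``data("cars", tag="v2.9.0")`` behave exactly as before; only the spelling of the type hints changes.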
diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 57b290c32..9b0e7007c 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -60,7 +60,7 @@ else: from typing_extensions import TypeAlias - from altair.datasets._typing import DatasetName, Extension, Metadata, VersionTag + from altair.datasets._typing import Dataset, Extension, Metadata, Version from altair.vegalite.v5.schema._typing import OneOrSeq _ExtensionScan: TypeAlias = Literal[".parquet"] @@ -129,10 +129,10 @@ def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: def dataset( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: df = self.query(**validate_constraints(name, suffix, tag)) @@ -156,10 +156,10 @@ def dataset( def url( self, - name: DatasetName | LiteralString, + name: Dataset | LiteralString, suffix: Extension | None = None, /, - tag: VersionTag | None = None, + tag: Version | None = None, ) -> str: frame = self.query(**validate_constraints(name, suffix, tag)) url = nw.to_py_scalar(frame.item(0, "url_npm")) @@ -398,10 +398,7 @@ def _parse_predicates_constraints( def validate_constraints( - name: DatasetName | LiteralString, - suffix: Extension | None, - tag: VersionTag | None, - /, + name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: constraints: Metadata = {} suffixes = ".csv", ".json", ".tsv", ".arrow" diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index c13f847c0..e9546d2b1 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -17,9 +17,9 @@ from typing_extensions import TypeAlias -__all__ = ["DatasetName", "Extension", "Metadata", "VersionTag"] +__all__ = ["Dataset", "Extension", "Metadata", "Version"] -DatasetName: TypeAlias = Literal[ +Dataset: TypeAlias = Literal[ "airports", "annual-precip", "anscombe", @@ -95,7 +95,7 @@ "world-110m", "zipcodes", ] -VersionTag: TypeAlias = Literal[ +Version: TypeAlias = Literal[ "v2.10.0", "v2.9.0", "v2.8.1", diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 1b866cf58..6d349dc9b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -12,7 +12,7 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import DatasetName +from altair.datasets._typing import Dataset from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: @@ -333,9 +333,7 @@ def test_reader_cache( @pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | None, - dataset: DatasetName, - monkeypatch: pytest.MonkeyPatch, + fallback: _Polars | None, dataset: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") monkeypatch.delitem(sys.modules, "pandas", raising=False) @@ -348,9 +346,9 @@ def test_pyarrow_read_json( @datasets_debug -@pytest.mark.parametrize("name", get_args(DatasetName)) +@pytest.mark.parametrize("name", get_args(Dataset)) def test_all_datasets( - name: DatasetName, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] + name: Dataset, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 44c766850..c1c7e0655 100644 --- 
a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -182,8 +182,8 @@ def generate_typing(self, output: Path, /) -> None: .to_series() ) indent = " " * 4 - NAME = "DatasetName" - TAG = "VersionTag" + NAME = "Dataset" + TAG = "Version" EXT = "Extension" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" From 7a14394093cba4b78613f0afe0754a8d0886d966 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:18:49 +0000 Subject: [PATCH 099/137] fix: Ensure latest `[tag]` appears first When updating from `v2.9.0` -> `v2.10.0`, new tags were appended to the bottom. This invalidated an assumption in `Loader.(dataset|url)` that the first result is the latest --- altair/datasets/_metadata/metadata.parquet | Bin 19128 -> 18921 bytes tools/datasets/github.py | 7 ++++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index 969f64b18f44b812f11e0e1f34a58c6b592c994a..d47c4ebed0528df5c68dedf307f03f66fec5e63f 100644 GIT binary patch delta 12802 zcma)?2Uru|y6)2m9YP5Z2oNAt5vkI9?;VjQy$MPeX$id}5I{hY-m8F$i1aSKD2Pgz zA_#~gO}J72d+&3edu~~oJekZ|Yi2U@{_?%wtkseU+`<7H+(E!K+-(pKfhq!k=Elv_ zBXP~3A+Wvm7d zT~f>{FkXX;R;$ma%Tbg`ZvcLy}YYVx|vJiiIWiBDqt~ zd)NJ>ZN`6sq0vg8lbn0D0u{hYU0w?4lmvp}OrbDUlAjS6o~WkKYnWPKU_?uj)8lP2?D}OI2J;Gl{v~clAVIZrc8A$!NDUEpom#qXmEG|v5lZpGAgR$ zu@lj_;-j|)IR>F61eIkBTDGC}f{EUdR3oRbEb@Xc1hznf;2|rhD9oEO@rtU)@Hti@ z&hZQ@i3SKL4S~N9f}qjbBJNAj{laY9RO2t{FP{4)VGd@=+$X=PU6cicPy3vdZ1MUK2}2oM7Uj36+&31+Y#EDB_WC5_Hf;(chZ zJgH=-Nv-%Xgqlv|Wa+_#0H~GqYgdKSMOb0>=LfdVtwe5bZ4Z8u0@z~w0azj!%7FU= zwCiAmsM%?8UU7=aUez*%K}Q?t^<^`>v;uV&3&e#(Ss;%=Tsuw0&_xjNF)or=kbsjB zpu&sSxIT`MBnu}8X2np%tFBTJ|8GLhNLY^;a=rdIWrN02-w1zO-yQzKD*o0fOB&mM zROR#9wh>Y5{OaGT#8VNd@>|M^Zg?qc|CaI}CI1#B0x!xFLFJ;)j?KA;M4uPjCuoyb}=fW{}#}C`lS2Vf{47WmqwHziiey~#lCVR zwL((ZV)TCSXNvfVqJuaH1PUp|gUSCdS>q&_$72!HV(R}!T3fAJ^6oh17q*Ly%0h-6 z7+zwlV8BQb*K3r`X6XBgk^xCw*vA0NcxvLTF@qF9)P8Q0^~D4GlXQiQNXwj0WClrJ zA$d&#t(0isJLD?1v77Vymd~i+Kis_GES`P*!X@ktUgz#9c%3VaQc3;MdGGIk2>gYe zMpl~fV-Rhh9uQq+(HTv1?@ON?`0^``S@E~jO9=g*+S&x%x?|_*1Sg}r;HO(rpvyvVtue~r9uOl5;Q)9u1YY(V0}LZT7XJ+xr4(}ZF^YZMDp->lR;${ zw=qp0-vS4OS&XD4PL`|~Iea_w=jm6BJ(K?sn9N*vwamgWVZko7^Fe6PCdN9J zwMpV58}n3}5bL&8bI#a)8%>tkF*@8rR7$(oN1scv zpkFSOhDIh!y|(gc(G6Mld~wcwkX|z|e`u<-@V&CuB0_!Or0?369^_vC=ktPvcU9Gq z37&gDLsvF3cN9Y$Jy^99+*tcXj11ewJms8QRYwuVk~8vS_JYH73&xMKYvwQmyzVF0 z_Fn7nbEyxnkZ=rD`30;w_%PJ@wbQ;BtvEF*(BZM#aYSgSIUx*9yxe_dF>T9wb% z`UNhoB|Rg>A7hU9B4TL9l6ZP2 zLgJ`AJk6m11=vA-f(11@PFJMK1V#(Q$nliGN)uYsFwi>U0XEYg4u&UusFc;dao^XF zdmOPO7t`%!p)9KqF8Oi`Wux5u&Xg&jwAe;cJ127Zxw~l`m;Vt(Otgn%uK{&t>xkB` z3!eIp*t>_LN_6#RpEiityX-;&zFi4P^uE>EX&Ru#wZ3?1Y8am8$q;(+vvJN96Ir=- zkCm@nh`@4E)=|o0$bT<3j>J#-aoEm2+;5q@a{4&aEx>ey(ppYdhAZvXme?qVZtBg~ z6_i6=8|f8^7|tPC4v^WU%H?Tqjh64TclVb@Kerc@_|;9=94K-Yt>~Vhv#(66>^_j9 zI4JZdOeBr&j+AF@A4=-BtpBj!z&$&2YIRSwPh{7#p{c`uooi7x)W4w69o073dOgT6CyA%iX=0$_QlrrE|$nAB)??J7K=^w>u=Qasgy~0 zF3|7Y)sBZ``0tFS!5ZtS-}Tm$-D#V7{o<_rkXdC}sb^x0FSg(fi=0Y}%&ikMmTm8? 
[... base85-encoded GIT binary patch data for altair/datasets/_metadata/metadata.parquet (not human-readable) ...]
| ztV$by*O$B=Oaiu}1AVVDXWVeZWg#v?7cmEL!jH0}KKFYZ)%^);hDPDW`;V(dRudgd zsJ)V|sC@<|n0|iVX{oA(G1!y%J_U76^&d6`?>lC1$(CwAPQgy()rO8f zCAd=q_{k>7%wU;ex9EuOfW2m#IP&{Pfp5j-BR!%&FRWcx-t>{T z#RZAArFt?w%qLwZx~0gp^Nru=`a8kz@*Y2=8y3Z0p)r4+wGwsnxzkO396?2-4QEua zb@}r0{n`lV_3N*G_iqj4Z&WkevewpWQhBAr$ZW+>D0GWojEPEWgkZ)k6a6rzHp9)j zzXt4rC=J$nUhJ;9d0ZpB*|J!Ru*i~<`=&yrar0z{GS&r{|4HtsDbU>nZ5@Q~Kq}8Y zSikGpc-G*ZkJc&?qHi;pjcIgJczNYU5^vM?v9jxPnPQc*;_^5Q`GTamUgV!n zdNLgW7IopqQpqbg@)=>_LaKDmaVMpT9uqX!Un&hYDa6lO*;ls)}2|3{!5hMrv=l%r)2O0fqq=#!lF6@4_K)@sw@(3IQ3JB>y`bo>-kQF$@eLCLHhwL-;_90i@*rZ~5zl6~9l127n>7AjWSz zGcu3$Lw|9sN=9}ilce@#2eU&#=}C@K%r@kb$y3jg(8`+~p_F%aWGG)zsJ_@8Hl zPSOzEf`<;IgdxFT2sel^83soVyQe4l<8dJ|ZtL&HhKEK05aaRPL*xH^t4RrGfA>6m a=ve>;5&pa pl.DataFrame: else: fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) result = pl.concat((trees, fresh)) - return result.with_columns(col("tag").cast(semver.tag_enum(gh_tags))) + return ( + result.lazy() + .with_columns(col("tag").cast(semver.tag_enum(gh_tags))) + .sort("tag", descending=True) + .collect() + ) def refresh_tags(self, npm_tags: pl.DataFrame, /) -> pl.DataFrame: limit = self.rate_limit(strict=True) From 99f823eda9cc51189d3de53c298c9ac861306441 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 12:49:10 +0000 Subject: [PATCH 100/137] refactor: Misc `models.py` updates - Remove unused `ParsedTreesResponse` - Align more of the doc style - Rename `ReParsedTag` -> `SemVerTag` --- tools/datasets/github.py | 15 ++++----------- tools/datasets/models.py | 37 +++++++++++++++++++++++++++++++------ tools/datasets/semver.py | 2 +- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index fe8a0ab33..921fdfc75 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -25,7 +25,7 @@ ParsedRateLimit, ParsedTag, ParsedTree, - ReParsedTag, + SemVerTag, ) if sys.version_info >= (3, 13): @@ -121,7 +121,6 @@ def url(self) -> GitHubUrl: return self._gh.url def rate_limit(self) -> GitHubRateLimitResources: - """https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user.""" with self._gh._opener.open(self._request(self.url.RATE)) as response: content: GitHubRateLimitResources = json.load(response)["resources"] return content @@ -131,7 +130,6 @@ def delay(self, *, is_auth: bool) -> float: return (ms + random.triangular()) / 1_000 def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: - """https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags.""" if n < 1 or n > self._TAGS_MAX_PAGE: raise ValueError(n) req = self._request(f"{self.url.TAGS}?per_page={n}") @@ -145,11 +143,7 @@ def tags(self, n: int, *, warn_lower: bool) -> list[GitHubTag]: return content def trees(self, tag: str | ParsedTag, /) -> GitHubTreesResponse: - """ - For a given ``tag``, perform **2x requests** to get directory metadata. - - Returns response unchanged - but with annotations. 
- """ + """For a given ``tag``, perform **2x requests** to get directory metadata.""" if _is_str(tag): url = tag if tag.startswith(self.url.TREES) else f"{self.url.TREES}{tag}" else: @@ -390,10 +384,9 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: rate_limit = self.rate_limit(strict=True) stop = None if rate_limit["is_auth"] else self.req._UNAUTH_TREES_LIMIT fp = self._paths["trees"] - TP = ReParsedTag if not fp.exists(): print(f"Initializing {fp!s}") - result = self._trees_batched(_iter_rows(gh_tags, stop, TP)) + result = self._trees_batched(_iter_rows(gh_tags, stop, SemVerTag)) else: trees = ( pl.scan_parquet(fp).with_columns(col("tag").cast(pl.String)).collect() @@ -405,7 +398,7 @@ def refresh_trees(self, gh_tags: pl.DataFrame, /) -> pl.DataFrame: print(f"Already up-to-date {fp!s}") result = trees else: - fresh = self._trees_batched(_iter_rows(missing_trees, stop, TP)) + fresh = self._trees_batched(_iter_rows(missing_trees, stop, SemVerTag)) result = pl.concat((trees, fresh)) return ( result.lazy() diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 044447707..449c412ef 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -34,6 +34,13 @@ class NpmUrl(NamedTuple): class GitHubTag(TypedDict): + """ + A single release's metadata within the response of `List repository tags`_. + + .. _List repository tags: + https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-tags. + """ + name: str node_id: str commit: dict[Literal["sha", "url"], str] @@ -47,7 +54,22 @@ class ParsedTag(TypedDict): trees_url: str -class ReParsedTag(ParsedTag): +class SemVerTag(ParsedTag): + """ + Extends ``ParsedTag`` with `semantic versioning`_. + + These values are extracted via: + + tools.datasets.with_columns + + Describes a row in the dataframe returned by: + + tools.datasets.GitHub.tags + + .. _semantic versioning: + https://semver.org/ + """ + major: int minor: int patch: int @@ -121,13 +143,16 @@ class ParsedTree(TypedDict): tag: str -class ParsedTreesResponse(TypedDict): - tag: str - url: str - tree: list[ParsedTree] +class GitHubRateLimit(TypedDict): + """ + An individual item in `Get rate limit status for the authenticated user`_. + All categories share this schema. + + .. _Get rate limit status for the authenticated user: + https://docs.github.com/en/rest/rate-limit/rate-limit?apiVersion=2022-11-28#get-rate-limit-status-for-the-authenticated-user + """ -class GitHubRateLimit(TypedDict): limit: int used: int remaining: int diff --git a/tools/datasets/semver.py b/tools/datasets/semver.py index f18e1e992..788bbb2a2 100644 --- a/tools/datasets/semver.py +++ b/tools/datasets/semver.py @@ -1,5 +1,5 @@ """ -Parsing/transforming semantic versioning strings. +Parsing/transforming `semantic versioning`_ strings. .. _semantic versioning: https://semver.org/ From dcef1d984b79cf622f418b7e6ecb72214656e62a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 13:22:44 +0000 Subject: [PATCH 101/137] docs: Update `tools.datasets.__init__.py` --- tools/datasets/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c1c7e0655..c8e67c394 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -1,6 +1,14 @@ """ -Adapted from `altair-viz/vega_datasets`_. +Metadata generation from `vega/vega-datasets`_. +Inspired by `altair-viz/vega_datasets`_. 
+ +The core interface of this package is provided by:: + + tools.datasets.app + +.. _vega/vega-datasets: + https://github.com/vega/vega-datasets .. _altair-viz/vega_datasets: https://github.com/altair-viz/vega_datasets """ From 173f3d6f5c43a0f248502240c8f1bf6ca7536415 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 14:55:31 +0000 Subject: [PATCH 102/137] test: Fix `@datasets_debug` selection Wasn't being recognised by `-m not datasets_debug` and always ran --- pyproject.toml | 4 ++++ tests/test_datasets.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2297ca2ea..e7ce8ca7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -141,6 +141,10 @@ test-slow = [ "ruff check .", "ruff format .", "pytest -p no:randomly -n logical --numprocesses=logical --doctest-modules tests altair tools -m \"slow\" {args}" ] +test-datasets = [ + "ruff check .", "ruff format .", + "pytest -p no:randomly -n logical tests -k test_datasets -m \"\" {args}" +] [tool.hatch.envs.hatch-test] # https://hatch.pypa.io/latest/tutorials/testing/overview/ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 6d349dc9b..fa2543ced 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -49,7 +49,7 @@ ], ) -datasets_debug: pytest.MarkDecorator = slow(pytest.mark.datasets_debug) +datasets_debug: pytest.MarkDecorator = pytest.mark.datasets_debug() """ Custom ``pytest.mark`` decorator. @@ -345,6 +345,7 @@ def test_pyarrow_read_json( data(dataset, ".json") +@slow @datasets_debug @pytest.mark.parametrize("name", get_args(Dataset)) def test_all_datasets( From 3f5a805b34d22727e93d4eb4dad27874e68461f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 15:25:56 +0000 Subject: [PATCH 103/137] test: Add support for overrides in `test_all_datasets` https://github.com/vega/vega-datasets/issues/627 --- tests/test_datasets.py | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index fa2543ced..fc61caf8c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -4,7 +4,7 @@ import sys from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, cast, get_args +from typing import TYPE_CHECKING, Any, TypedDict, cast, get_args from urllib.error import URLError import pytest @@ -12,10 +12,11 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import Dataset +from altair.datasets._typing import Dataset, Extension, Version from tests import skip_requires_pyarrow, slow if TYPE_CHECKING: + from collections.abc import Iterator, Mapping from pathlib import Path from typing import Literal @@ -23,6 +24,7 @@ from _pytest.mark.structures import ParameterSet from altair.datasets._readers import _Backend, _Polars + from tests import MarksType CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -345,14 +347,43 @@ def test_pyarrow_read_json( data(dataset, ".json") +class DatasetSpec(TypedDict, total=False): + """Exceptional cases which cannot rely on defaults.""" + + suffix: Extension + tag: Version + marks: MarksType + + +def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[ParameterSet]: + """https://github.com/vega/vega-datasets/issues/627.""" + names: tuple[Dataset, ...] 
= get_args(Dataset) + args: tuple[Dataset, Extension | None, Version | None] + for name in names: + marks: MarksType = () + if name in overrides: + el = overrides[name] + args = name, el.get("suffix"), el.get("tag") + marks = el.get("marks", ()) + else: + args = name, None, None + yield pytest.param(*args, marks=marks) + + @slow @datasets_debug -@pytest.mark.parametrize("name", get_args(Dataset)) +@pytest.mark.parametrize( + ("name", "suffix", "tag"), + list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.9.0")})), +) def test_all_datasets( - name: Dataset, polars_loader: Loader[pl.DataFrame, pl.LazyFrame] + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], + name: Dataset, + suffix: Extension, + tag: Version, ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" - frame = polars_loader(name) + frame = polars_loader(name, suffix, tag=tag) assert is_polars_dataframe(frame) From 4fc84469c4d69331bd4c1f5bf30c63b396c99b4d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 17:54:43 +0000 Subject: [PATCH 104/137] test: Adds `test_metadata_columns` --- tests/test_datasets.py | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index fc61caf8c..205a0d958 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -4,7 +4,7 @@ import sys from functools import partial from importlib.util import find_spec -from typing import TYPE_CHECKING, Any, TypedDict, cast, get_args +from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError import pytest @@ -12,9 +12,15 @@ from narwhals.stable import v1 as nw from altair.datasets import Loader -from altair.datasets._typing import Dataset, Extension, Version +from altair.datasets._readers import _METADATA +from altair.datasets._typing import Dataset, Extension, Metadata, Version from tests import skip_requires_pyarrow, slow +if sys.version_info >= (3, 14): + from typing import TypedDict +else: + from typing_extensions import TypedDict + if TYPE_CHECKING: from collections.abc import Iterator, Mapping from pathlib import Path @@ -73,6 +79,26 @@ def polars_loader( return data +@pytest.fixture +def metadata_columns() -> frozenset[str]: + """ + Returns all defined keys ``Metadata`` (``TypedDict``). + + Note + ---- + - ``# type: ignore``(s) are to fix a false positive. + - Should be recognised by this stub `typing_extensions.pyi`_ + + .. 
_typing_extensions.pyi: + https://github.com/python/typeshed/blob/51d0f0194c27347ab7d0083bd7b11210a09fef75/stdlib/typing_extensions.pyi#L222-L229 + """ + return Metadata.__required_keys__.union( + Metadata.__optional_keys__, + Metadata.__readonly_keys__, # type: ignore[attr-defined] + Metadata.__mutable_keys__, # type: ignore[attr-defined] + ) + + @backends def test_loader_with_backend(backend: _Backend) -> None: data = Loader.with_backend(backend) @@ -428,3 +454,13 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - assert len(tuple(tmp_path.iterdir())) == 4 assert_frame_equal(frame, frame_from_cache) + + +@backends +def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: + """Ensure all backends will query the same column names.""" + data = Loader.with_backend(backend) + fn = data._reader.scan_fn(_METADATA) + native = fn(_METADATA) + schema_columns = nw.from_native(native).lazy().collect().columns + assert set(schema_columns) == metadata_columns From 9e9deeb95668d2c4e7d30311e85a8f9f6acdc88c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 13 Nov 2024 18:13:52 +0000 Subject: [PATCH 105/137] fix: Warn instead of raise for hit rate limit There should be enough handling elsewhere to stop requesting https://github.com/vega/altair/actions/runs/11823002117/job/32941324941#step:8:102 --- tools/datasets/github.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/datasets/github.py b/tools/datasets/github.py index 921fdfc75..6f55c1d52 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -311,7 +311,11 @@ def url(self) -> GitHubUrl: def rate_limit(self, *, strict: bool = False) -> ParsedRateLimit: limit = self.parse.rate_limit(self.req.rate_limit()) if strict and limit["is_limited"]: - raise NotImplementedError(limit) + warnings.warn( + f"Reached rate limit:\n{limit!r}\n\n" + f"Try setting environment variable {self.req._ENV_VAR!r}", + stacklevel=2, + ) return limit def delay(self, rate_limit: ParsedRateLimit | None = None, /) -> float: From fa5bea8b25f55cc5bba32c1ae8963a89f66481ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:33:13 +0000 Subject: [PATCH 106/137] feat: Update for `v2.11.0` https://github.com/vega/vega-datasets/releases/tag/v2.11.0 Includes support for `.parquet` following: - https://github.com/vega/vega-datasets/pull/628 - https://github.com/vega/vega-datasets/issues/627 --- altair/datasets/_metadata/metadata.parquet | Bin 18921 -> 18777 bytes altair/datasets/_readers.py | 19 ++++++++++------ altair/datasets/_typing.py | 24 ++++++++++++++++++--- pyproject.toml | 2 ++ tests/test_datasets.py | 2 +- tools/datasets/__init__.py | 13 ++++++++--- tools/datasets/_metadata/tags.parquet | Bin 6247 -> 6290 bytes tools/datasets/_metadata/tags_npm.parquet | Bin 2597 -> 2599 bytes tools/datasets/github.py | 2 +- 9 files changed, 47 insertions(+), 15 deletions(-) diff --git a/altair/datasets/_metadata/metadata.parquet b/altair/datasets/_metadata/metadata.parquet index d47c4ebed0528df5c68dedf307f03f66fec5e63f..3eaa28ca39d5ab0230c23b9bb1799d78ffd64eb4 100644 GIT binary patch delta 13381 zcmb8W2V7H4_cfY?00}je(2>x4?;0qnVO002O$01$trsRYCTL>&R6m01mjeUTMLye6Y35&ia}7O(t!*L<&9kAKC% zi|%1=Z~}wDILK~K1vW}T)!xG}Wi|?e3cEHNkv!t6l>}aiE01$rRcq%wr9Y&+J^C4@ zSmLS9M+XFUhv^A1z~ajsdq3SvE!Hz=pw)4tYV2i73^4Foc>YAj6ZT;FV!a^-s^qY)1W@FwAvljeJJxs`|s 
zPj(e*qT=$+c>#Na1}mXl?65Tw3^2y7ZH(ib_9TfMMde%-7X6M8_2RS|Q85Dwb$!B2 zTM`9zE@gkYl4~Gv5}gf>t;P5Y%Y>)JFX67@z|S0x&DE3Sfe_pRMr+i9U0))ONh ze3ICgy;VIO5xUlQN&*vBO9`Xrz+33Tha}{MQHCl`Ms=#%nZ#Jk07fF{My+Nf>S2Qj zI~*1t4X?9|U491gJk!$Sk}EdELOh~8EK!8Qk_8L`A698?_}_%-&2;+lBdgyohNzv^ z|HzR74C!eBQ=OD7-a-NS%^4y#$mm*hCoU}{Re)+yhN_+#{SlYWxgHn347@NQINrq{ zg9LyJ2&4gGZ3w{CBKXtB3?PdGFi?h3b;(N5o`8x|Fq}Gf-fJHDvOh1!umKg9ezNg7 zewmO*29d#jEnP{n+S|X0dGuXPNVEgu{Lb^_s~`YOXNCueL)+ppp^6Vo`PY-Wj2YWaX6}DSDxKu?Tj%u1~|Mmh7hL6y|Qwg=<<=C2kI>nBA0taxg z-eN(Zy_4s%DjsFD++vwyhYteYhC!d;W;TAH_?FaIl-nTH#5ljU2osv|iQAJtQC%5n4K@Xq!;7#f$ zYxlg85DKa27zhrtm9pLMuo46v=JEZk#&!n+x&H+>v3O>IYMxXQSzBefejKB)4u8Ys zBYH!PlP~kZmx6@>=Lw|@ymA+N9n{3WL&h~GlNJ(}S=o)}>=24qw^t4HXF-E2ebWcey<{x`>oF2l>r6uH|Cp-?W+Bi4`+>-mHjfv@4$y z0nNMKLh7fPv25kNhmFWrB;xKLo{9RYM?O^CX$}p!!`=^ zWP>TI9C=9z=rHOKq_6WT-{D5BOxX0yB&T#dREoyc$Ti4Gx452Ub%Q+_sZ{Rt#aPcO z@7F_{9ak$Yl1{ZuhWl~aY##XRJUNO6eW+^`S8 zRwd8M56jAX&V0C10Q8((Hwm<1zh*mi=&D@G)~>8x2fGit zow2u@reKro^&=`iD-vhn-qBi1Rv_grwM1X<%6~I4s>p zj_Run0<;gmv@80=_Bl=5QwD)>dM0!JlDx(gQGCI_s}pH9 zK4N_3h?-ZaL(=>BL?rPO%R~?@8y;OzHt&?|B$J?pV${W=^c@#3s$wZjT?me@qV-N8 zTxFnUATR45lD2PtcZBz9L9lJ3`p-JDmOkM!PA~RiU0RKrhUkWC({SDM%}{ z*hG(-Et*^G6LK0?b+HcTmN}pnQ2pHgQFyD^;wTwC%w9mTK((=>pMe}ejie3R27+;j zs9qryv~ME3q&pLaS&Q=~T@<*@RPPX4fM2(S2X7Sg_INEr%<%5$#8E)f{7EYJ5E5m$ zFGHcmIUqO@tvCP=MoR;Abar*P>2B-h@SAaykW`;(IZEl#SBMc;`@rzx|7|T`0f>VD z-b#ez_1nIMHw>z~R-Kv4bMm}hL~zl;#QZpE40q9W#Iz_ZlZW9_%>y}7CH_w)kJE)e z;?Rb#|I6fk$ju{ zOK8o{Gg|MDC}{Ccba8;NmubAh_T8T`C&3YS1s7sG4tK>#`Ae zl!dxCB_EmB|NZ_K)J#4XE(M2a)bssw&lPe1)+aeGD%|d|iXizfpGxjBhUT2T_Ql)% z2&?R*F0DYjkdTKPwWN*QRRWQ6>OAnX85wiRyy_oVh(|=r=}h*Mic8AMW}kw1B}#C8 zi%5ciTB>=Y!hSF}d7*(@M!YIOW#||Dg(?MeP@TNrRP3tg32LV=F1c4-*p<}G>($q8 zzn1s@b&q@S@OT((nXUGOr=*`k72iT4X>;{4mee9fza)59STYRSsc$;TxXKp}D5rah zZ5?|LFG(CmKmG9eL7!IRmD}T{P`9b7Coa${j2qN1%CC~m)8-#Vy5FpQM8fl=!*P)3OLU*M}>quWL-|9_q`wU7skpQ`p%#Mlo8-q(vk{ zCf+7AU&O_|RO&r)%=EKi@2$N#Ta*V>v@X4zVr*-THM6-flbG|il3S307F`{cle&oa zz#X6Yj(i!xVnYcxj#|g1j{X=$oYL^wATSHi~Qmz#O^Q{Ngz`f)u!n$P-S`#70$MDMqqRnu=&E1hBCwL2Nw zBQMTtOF^vfh0cELJ2Xt-84mBu8C}0c@T4?o$hSFtdx4Y;z8NF+b9RY$vxnZG>X3J8 z2b06}EI9Cv zwQQC2cbs<9W0M&>&qRN=eOHf-?(%Tk2QAX9{D#hq>e2JyM^d7OF|P z2~c!++{wF6c|RWJdRMgBVtMToC&z@z0jRfD!7;Af5A0H*0F&=L>g zkf(8RXRpVZXpa1+T0;RN|CMWcAy;6~ZTN_?^h3T} zovIkRli+%0eSb}iTbs3rh1=-vKhn*5xz153IZ*Dq5=4&wC42+xjh|gZkk9aDT9rvfv9jhdik><+}9Q$sMMvHzJAS8~-S}Nv~0?qU*8p%9i52kaOlj2%`u1 z-aUcQBD1MnSJ@e#T45hlV0;r+)xFsWFQ~UJ(JLz=m0%l58RW;xJ3^oS3w4LOkat^0 z3fIjZ5_o41nUD&{Z%yK!Utf%QT-+OWWp;USz!nfewW@f0^(H+< z)?)X<<5fsU1bDo!VF;=s*VT0cc8Eg`b_q8y8#ceg{1wG_)yBXKGEJ0wRiS{|sVfeu zKzmKn{Wb<{^F(DE$p!Z;1cd{ua>t~ia-uE$S0RfWg2hn+yw@ib*q%cR#1jE-V(|S| zh(aWL>0=3nW%VK=-{^XruAC?LlWyD)d%xPKsMp{KD;LN8M<(LmQA$=WlztFF?S$Zyh$-p-y(pzWQwZ;D{nJ7kIvWeq2j`x1!_yHq37RPQ( z0LuRs$E2Xd3tvcpHF^yTVWHm;*5-nbs^m_%O6mLugr)yL_%jy5Ilm#yF`lqi`qD|F zpz=Q;T=2g`*!BX#+P@);zTNMwf8kqA9YLM=i;8u|IlUX7W_8;KnD9|Ms5XU5$UYbu z{!au0QU8Hp*bOX#KV=Mmt}_!DNy2+~99+$;?>FLn3EX-gDDkg^+OQZ# zVO=zAgLqM!pBHu1t77ReDTJX$jK9cl_eV-XC1sf~tuk|NSQsBtjlQ~oF{$pyRp-XU6HHn4rVKcm3A`16Fx#cYgdV)xVmSiZvu8m7H z&vWXiCdFv%(ItveSQsyb6>x5DsjuyfT%tHj<$o;AWRn_L*P6-aa?nwQ? 
zbaWd0^}Q=8D%ZIh0eE|$IFy1dcfo3%MOaW4zYsTy8q^Z=k`#8gl}^YGR7_MvuUj_f zm`qh-)OCUjW-NH+8^hce3~Xc_1V{#iN03$dz9Z5m!0j||f;jh<99-|ku`y%&6^3WV zP(D;o4PjJBa{kV>v(-6QnN5sI5RXK}bHl-ZTs*LkbKoEE?o5KJoQh-G`x*+np$5SM z5`3|*M+4Eh$SxBW&!O;8Z8*u_y)}e<3lLBmNk~@gMwB|r|I|{^xa{16Jv;|QZ+af= zb~4WI-cWK(EhH>)Ov=?#CVwxPru=vu2#6lV#R9mU41SDzxtj?4yPF6W@V)1YW@3S% ztZ||V0qfI>oMAdLqC)44?q6qK)YR@+5iw_3w)Rt)CxJZ_j<{JFxs`0`8~mC51x4yV<_wFjB_KZ3qm12_IegJVJny8#cAjmOt**n{;=kyqJwN2 zm+4t=zj5e095~_LTaYD2Ott=%!tI(y_e$+C1!K#mz_(xD#|0Vt4a632#h4;7-XYJu zy)B@mX7hS06LA#c+)T}xA+K**n7#HtPmiNBwU`{+Ht%EgHdxbsLHMX@*RZt%m64jZ z&s1J#FtL74beDY3kc1PVpsGCdlB89n&{8>Q@jY#<3XZ?c5Ys#V^=rxg^a_@Dt&Z}K zlPbx-{RD|5KH_+*zv@4DBh-2r`iz^HQ!|@nu69)7F^Hz&MMWwZlZJVeLI7^{qNV!x zaEBO9_A9t&f@Z-Y6@N(`ZLUms;hGmyP|f0VEXM;ceDllMp}`Zd_4f9-3jkovVFn-m z1E>u*`)d$3+kgt}4sbsS4D0n9(=7Xa&>0b<-#&1{*$Y|1a2kn_p{B6=+v2_a~<5ymj^g0A}85SSy>wfG4lCD-H}6u8-%H50n$qa!tiN>iktT zMMQ2ZjIpL752wP-y`5{_UmO~5^GQqAo$+pmIV418w)Dj4+lUVM|VZYGzN!{ z(Y^*B4?>qfO-4pmiDpqrpyW1St7I47eYFc9(ZV$X89$+`hde$Y+V>2j$LI8dM*QK4IQY0VeTGq(V z>L?zGjfWB|p>qpQ*v;`BiI<#3;rxYU$`z_@tbGY{<+r{Vea?N-Kl(myNx^N{Q(Cp- zsb2qj-+DBv?j^bGrotK`Okq!%C8a#CZ)_+}F;T@_k>eYpaC}Mg29m5dKrd2(h-D}1 z%Kbrb*Qa~et^k3jdK9SwUq>}FSX=_QJxZPu)wbs(j@uzzE%Lpe`nKf^NOW-rNq?#9 zHENJ1Wfymt)VTh>NT7yM%h?O5%Q?gX-I$$vP8|b8X>(?F6Um?fs_7GEj_XdL!x8Ig zaLa`08+z}wHrchE*hTTlWH()R`uml8tBGoIE0D*{lO)$qtKCN!;iaXOZic@|yILhu zYBH&+ruoE8p1EhGEjLuY%&;euK5{;uERB#RurtXI%=qqRnIi7yNAc5l$#9^cLyB`D z(UD{s#SQO>dZqE)Ir)^2%{)b?U9bcr=IBwCvu9hNU*+pbfpn2>49jp7J#A(tAj*z_ zjP`V~E0(4z-s`fW<7i0NQ)sX!;K9W|{~r?yT%2pLQw!_8w3c8&HiH9H$TM9LI5h>OeXYVXWFt$i7B zemXkIHg0hDfozBfiH9Vw?Bq_!%Fyfx2dxV@5aPDsoDa|AWP9~4`p%lOmVU*p4ZuXV}Cx?Cy+OZ;l@ zAz?t;8e2P3X1$8{D*eY9;kKU?6QO!0eV;R2kGT6=S9B_$MA!t%#QUkmI@j!g7W<3h znfdZ#bw7ljXe>ZINxOe_fg)zaW&+jfqPlzceI_au&EIOw-I4vk8$;Id#FCg*R>TW+ zmC2_2-GewN(?ss%yHwq;OMsDt2Nnv$jyjl%^yTQ$#w~N-7;SuDEv1g?VO?Yl*;z;d zTXc%%f}nC<@?xGpv$0%hNAeZt5EVUVCZx9&-sYRI)bZT2XOxTwkf1wMrla4+vx^zR zqklptthrwZ32Kr%wG#`NrZsX=ubzgYT-6z6;|WczOxFY&g*4g?7J1NG2@2o+j3p_@cCyP7}j7L&S<=@CuOoRlB=_T zo50y%s3I~I_7YYHLx5kN0XfZ27TD%vgJsug&)QeIiq21=D3=N2oyE0*DIK+dp5dmi zo}%xUZl%Xs){oN9yY4m)b+JE_d2hUP+MILSE_H4VdAv&|^!PTb@aMTRou;87OFn|T`(@L@H`Vx04;#hT093Tv|F$D9{zN3F;DQyZl6OsDG5 z!Lpk?a?*Wa@yJcdi! zy8PgLa;n_)rHAf@XNIB+X2*6xTTEU%^w}}!_;$rYa%T+cF1Jh9az{p~2BG13_!`{s z#nyB0bfv}NC&g14bmYD_3$G^mKoH>Pcap2S zp-&xma(_u?fy2Ys-}9|q0t&wsptini z>E3_CBiG+v_#<%6ZG~Dlq%{^zuA8u65*pN(QqBp@R%cAk6}gx3^ULI@lj)t0G>no~ zyGg@M5-^gx)5??2o)PzYF}h?135KL?<}7VFUmvV_IR+=4FA&#Ew`n_OHj-2sr@@Ts2HYYwvg75{vSj_`WY2eFup1?A?b0NljTlA4g3F5vQpl(V8(UIn+mX=Ka+2r7 zH~7wg-NACJ@=*S^A5_e{vL2w;5pAR+wu*Dvp*? 
zjhQ)xf{aqL3yhsi#h>9>G~{V%5_sh52p9@127U`6kBC|%i-=NSR)D;o4`J}!)R^Fz z=QVsS{-jZW&qq}bmCF_2U^AC$#?F5~k1nX>RR$7IfE;rUlz-b&%8DiUuszX>%k zcWEiQjU=R;MynIJ&1q1B`8&OWajc9~ru4J>5)Lb;>oi>PjU4Th4iH4dd_s?T9imXr z%h~XKKkP(u^Y`y@ede;EQtpYf*)COGWQTN;gDk`;r@|-IR(;3f_y~#Eh#Hw$;+K+7 z7b=!p=xdW5jar)=oEIAoy#Nix?FNin=(AC}ep8o57cq3|yZvq1)+v!EcFw{3YAeDj zFP)Srw1i)W-sI0Os>nV{UknxfSwDx8Pi6j+``tRr#h3Ka4$xCCtC9~WG2Ew5-uRKx z`aL5n3t{rl&^`%wVS;@IdpiT>Z}sOM(&91fw>f&81ZkCU%TgV>el46Gz4g%B;$>`l z-|_34{?bvY9hz&Ao$&F_S=+{Aqi5-^(QBnIGOQ`6jY9*suL9;sf1KRu06|;QQ4)i7 zB`vvZfo#dcXS9S{qDfuB$<`mA2a-@*RgmgS&)@Ca@~D5p!@ZVUhOsXzs|>`e*01=U zXmdWTMh*%V0yu7p)7t1LGBW}K?wy+|vv*cS$>PoF7|_Jn0&tiCVgMe%rBwibU^COd z2J=sjx+vkEb84isX{3{rW5x0ROFH~xV@!njbbL0&3dK?BsbPQZ?&ihaE^ozlij0b; z{C3eOPOlIV_VxSg|N9-4&X)Fvb+|7**`VY0{DK$as+dd4= zM`oQzV&f#dbVvGYVi0rG5d~fGjmm_w$yr2(#dA@X29(ZCJSL&A!T~bf%+`ZwuV+`@ zxyf9ay5%ddrJS?ez@URp@>&Pd7n-iWGO#xCBE-mOTKnXkEqzT4ej(ffZdG!9mF`>q zC=ye-`St37MVsW1bJKi&FnEgZ6iEJJQdQEpdSwy}V|)XD&Ko)MT~A?M)I@ z&@@eGD>#nP-(x@0EwkyEu4Ua(xyEFWR)oGDDrNJY2pMHI*#K@>4l6`i#{6-j57G}`yprSVARekSAj_Q3MJvOn|DnN@|61kejcWPH0dKN+~UXJby zYNNzX@4nXa4}eKf#>aDM-u#}^OZuj~xY_w(i7P^wBw@teU~;VZ5Xgq0Xya1ejm!XQ zC8@-FQr>%jgQ@81CDPLeU3&$HCI`uDB43*)D5fxTQEDgJ_Ce@J1c(#86&en2iL_6E z-;gi*X~!ao*VI7c61X_FTg)sg94dvgVAReWsX}G|e1@1MIvvtgP40w8U`h23Fk9Yg zC=^~!i@V*`sIgqs>fyzE@nONFGNC4`NFb*St}xzAO4k!`zKLT=Xr~ zGM&=f)HZL*eka4o4*yv#x}?g(#&SM>6X-Q6k&!Ule3g6v?^2vsU}4U^te;2eA5mqu z!nm)oK9;Or6MD$$!d&}Fh@XWY<1|`rOfseaWuVcN;>GCnHguPvmnDay;!2Xp8^niI zcYz>yPXf}KHeCDEhOUQ=E-=Ubxt6|4aigN4!TtL#@*IVE%~CY8j-{rHF$2*sMkhlJ z>j{SvodbcS?+$5@JZ^J*zN;#~_GnOKr3sN5=ph){To*9OAr61?br;() zB$`7sA}YyOhWeIr)%X|9z@@CiInO33$*%HN$c-xa#Ib%vOw{FZus)Q|R~SY;)q0`v zg<{HDdum+@&$f~)P`KI0&^i5X!#ssvoxxTnSq?6ZH^wbWHDJNceg#xP8Af{|aO3S2 zu$O|cE8^7I(+UbDjC4(u07>Xi8>=9=D@UYk+E~41s>0JL8PsW@Sz(&QS1@;MERdWd z+2RaHSmZp-Iei?ux6tH>?;(n_e=Te8Yo=LLf>Cxn`O8JY2EFO!z%g9_DRJhKp=CmJ zcN%x2Nh4HUlsJYvN4+c>T6gE|Cm}PVH!42Hq{HsyblD=4$i;2r!f^E<`y%nE$A#*f zN$GyY>4uA^kHXn9Y1D<{iNZ687#>b6W@tN1v)=i2GfvTpXoC)d?$trvIjVb--Q$qI zcaq5$k>H!H0mb%VPH(zFW>7bbFgAUO8{3DaROd#42B><|H4nP{T^NG?AbV{fP8|c_%PbT2s6;9WkpQ^ZmEDjdz=##V z0-y%`L-MGbVIv&!F;G6ILbPpJJ(pH*JRt`VH@EdTu8wK zHcyaH)*B|x&+(Hzdbv1;;06s4rkQ$4#qkGin3f(3>;bVppg{N=J1ph=bTNxy*|(Bn zs*1zIhFAGG@9;#jxQ1G0HT zz8s;X-#zD7`y{t}BIj`J-N~+Jhdu3}t1{mK2P<$#%q`o?mX_^1AO^#sRc03{VQbh% z{vZt)@?6>qAr$5eWyIa&u|TAml6;Q$7HBw*anNH^CZi>$Qpk`a)Ca~{zcnYpUDSe! zDH{`{>QfFLW#Jh^TcDH`;{t$VSuzWctxyy@;1z-+CRj{4S!?EpviJHO1B#L{oR3@z zHxJBxxyP z-UThg@*k2ree-?sg+jN*?1$3)(wq7g{L;>Hl9$(%Z$e`#vKbuao31a4i8B`p5y!m) zMq69XnEUOe@^X#Z#!@hGQ6E~+eM7I_LY3QF2sIp+!7%Zm0@pv3Y(9v~G8-99MrZEC zF+X775r8+69CMas$hf7&9sq~ws_yBn`-;kQ_D$*2iT9z`V<%lyD?QE(IhC%&M{sew z?1)x?ecb()N`$W&G`G_ydGq`#r@dmh+Um<0w+VV3j(fA@+mc3ig8WDnMH(019%_}Q zUuK?K-;ncz?x5(*bAJ3wTUa@MJ)|E!qG6Eq6sF&&BLQr3cXyTdGo#MXPhG?jLgu4>XD4DF`#Ocn<_`!nAWZ47&t-eP5_EF8IPFOp)J$~EAa1u&NknvwH!#ZTu- zJpf6S=G3JmBv&jogQxKTlej26yaVaBQSjwf`|nNm-{^S}*8bzh`>}?L2iym>Iu1cg7`AJ#72pBvMu-Wz*cmxC)Lo1Rp_gnDcF8CgEb%*`Fc>{! 
z<3DV1KkjdWgtb@e$t}*!5|hWlzOi1on_35?95%i?qWBcUZ`BV}c@UL!TEatJ+@K+- zmQWJ?=EL|~Otaiyg|fPno)sjy_1ywF3J0xwA`f9d2pswf$k-m52@RBlmqAL=5d*|z zM|lwY2hnd(K}v2j8pC#tDCX-FLh_FMyW+95{YvF`>f&j{JQKja215tT1~)0fzcl&o z7fUEbg24CyyHOIz;-@l}5W51xf5@-m?~pF@t=~gB?ERgqz4?bp!Puc8I)UA_hPP;O z$WiLBZxj&F`D27ls=TDVA?H*ana$PhoL=RCoFsiBTB(FOhjme|0XvO}&B$91!4HkL zV_dg}8H929*8Tl+xR}a?6O?eSSKJO4(_i0Jo;d7oQN?o`@Kf^NT0_gEo&-yAuoH|$ ziZ|WH7afV+7krR-jY}^aBB4 z5}$;mz6NK}h)?-yc~gdl0?EGv{4=_XvB_6!!nbntjH5NfSR5#4pSZn$!jBzcz1*Q@ zQamM(3l2FK83Ihw4IyN`HF`s=I zZ9yL@D5I0um7B8UYk{Z(Vk#^@&@xa})4+>Bf69{_@E(#s`%Gz z9((}yr+k(Ggm>Zn`Ouny{qXlr*f!y_wio|_gJ|p2!QLJozHoAEM1vhfLu7C3i|$e^ zB}f&ZWuPtKXr*TOzkeD)jiQ>tMqg;Cl70Z;5&(_?I)?v$C-0iXBgt@az*OnKKVV8#ba^f2 z=VgOCp8q!Ze;jA{_cVhU)sxE_FR?W`fdGzgaX^OuHH`W{MNz57F7K0Y4us1AIG&}U zb(P_S{}F0Kdnm*C*0I;nQ4L|Q5y3WCBZq1=nu(timp(TImdT~s*3R*(X4G@o8p p_-$kVA54snD(|v-V;Bft4gyd`%`;+uVZi~R+Bs}`prQ%>e*if<3Wopy delta 13569 zcma)@2RIyW+xE9vomCbq%3{@s=)L#eiD=PVh%SQI)w}3Hh~8^R5D~pcCkc^g(L)d- zTKG2Q|2)t89`EThuk$WDmjc{M1=c%)@O#~M#Eq__Ce3yqT zDlrzPXxU$uIR!sL|Cu>&7IE2Uxlnnv$!J6VTwTt$v2SBXoz&;a+ZlB^>{iNEbvZbn zA+(&$RtO?Sn4UR7Aarm-<(>nKB8mbRpMX_~MSsXgzQpjl@pa)MPB)FyVA%q6k{CBF zFeFY{wpTAHUss=!AgjmIaLz-V0mYF0m>}HZQICQz=Uft7bJN zp6U$JQE95_slSLF4IiUA&eRmcelCCaNHsoSy*t>caJ z3?mykgJlrsf5EW=>IM#(Lj_@;q;c0>hc7Uq(RSw;F%*DDehB;pR{#W!&=7E0g6FIuXtXW@`(8Yld$+fPzaBC-ox^0`HQWYDJ9d zh${6b4lRBm4txptyM`d4)T(*_AuvE60;8Is2m8RnLFO33hzxno$2JO+^44nPa-V|8 zsRT}!9!>CoS{c7~mD^ti7i6wKvT|s}bAD%a@PiP*6zL1V;H6@qj3DHBY5|lMjs3a> z9I6};+YYs)Fn|LHqyb_pARGdJN&`P5(1aHP;n?>aFK+&{BE`vt12UMcnR%1FJ~hKj zYSUi$GS4Yy%wTj0yII`1E@Loz!coe*yF@ok0Hy!~Kx=2$V?v0x8_StA3L`8qm(u^^ zFo@(YUq}p??RM4XWI^Ri3HcgXg-; zE)@m$GCfL8hnZIk8yp4Dl=M4#KMSKV9DlGr6w(_Dqp7EKYB}ekLW4DNm`_!7m1kBO z2lv%xsnO+X&qHmDIpf3pCCxf5{Ny5y&LZ6td$90ur-c;>DNi9I{j8g_=MQh zPMu{TNL>8FBxS+Ha$p3aSn2+ta7-wG$FP5pCasG$lF;Lj`Aw}{R=p-qNfHO*{VH;Jc;;B&rN z@97lCT*r)x`s4YKTIj~O&j;79kW(&8^YwGBb5{6zFPRH`yt=749zFc&lb&psU{OO% z&Z@Uy;SBrCqJA(O0LRL04)HIT;4nB84yD6ZccRozdJ z5VdCNGX`l8AzSJJ0}nAy4=`Dmn}&D*OrETnDhTFG-6EhxhYmRj=Ok`WnCa+aak`nQ^Ty|!vV)HuBQml`YBO!_Ngl^Q7LEQ%!8=nI|nkVKLVh5r&`1Xhes zc@+xR?Ve{J9*=DKwLRU#B_fDYbQ+DEJ_wQrhfZM?Jmar|qf_upekmcX_Q0Waq~hqm zgmW!r(q(L3K*q{LHJlg8L5!~axk}7^{3oW1iOf`z85mM*C96wI65Xqx$)xAKKuUwABQNNq zfu%h&bWok4<3ZG{w@G|Dd8Ob%6DKFur7c$OBBL5n)OyJ~37k^#+Td9?7Q zFV9iSG})0t`rxoc!v`L# zyj*BoV_Y~I&v!VuEPDN&O}#682)_LF zT;`*H0)6d`Qd7OydF!OkM-KxwEut7pr3G{4&PBp2_9C3NinGHMkw$LC-##UE_NYxe z(q}dZn64$}T9yG7`qfMoJ8%d{sAta=dqkJxUW$h$bmNfhKH!}5bSS!yc8%2@b+#mH zK=;*6Dh@Jy(fM@5bgC&O!DsWCSD$DeY;1n`(8FT4iT+DfeMks_iV5E1z!9G@H~9&n zZYyPnwC#5hMCl!)!!39vlzV;D*(CEiWqipfx&-mp=3XsYLCbE<7wiWq)dO>fM)LDV z6*Z=zDg&o|H@CDQ_xsl`(E0Q4E33j{-S&PwT-iw9kqfeOWz>juX6zTx*J~GYlXhrT z8buh0&B%<|@D5YW8$8LZo*m$HIlZ~}T4$e4Wq5^vWvJ4}Z`Ia|rq-vOvU#-pAaSL? 
zhmf)D2C-a?;z-ixz@dI&ejW=YE-Q=X)aaI!v^3}DA=~{%rToJ*1+;u+HnNc5CC^Nu z4qg4qQOO4%KGC-CV|MpLBPqt>IeI68qRCv{OrQW+*g;*aDLHehmO#}>6RU1%VGZ{> zuE?7nZ-+XC4f2Pb6Kj1GCM}O$lBJuxT_(3Ig^G90XXE<5hHKn|6B^~#UB!j4ap;b` z@puKX9fl9xZ+urmjgO|dSt!EN%TEpGWySlHdrc3WZy))%2>Fb)=Ji#1EEDPwwN-Xf z{#3ZkY3zmeojV&b_LGlTp}Eo7<}~+o($YNi#O%)=U7Gy3+rUlU-5icER-3|i{QfOj z-E?j^GPSXwNveXC*qBfdg%JDvLhS2;Rxb4f=M&*nqsILOn;s-O{RG0qPu8a#n3&%b zfzPr&Bz8yrbv#+?1;RjNhe}ZnSe3aJTJ9t}B!0=usa&PYba((6>R`{*Y+|U4OU_z>N=v_4@c#x>WmjQ z@R+--gZ#c-4~p}=)7fd{r_Q#vcx9v)lI%wF@bX8)tW#LU>U~D8GCmx$Nhv#V(;?rz zsAvKog`D7>eYnpuamDmWy0f3r2&sj%lq6g7oh_kJ7Of=o?bqd`LtPsw<#A}%At@G+ z@s;A$S#Pzv_lx%rmPXgx^NW3ICoB)-SPNIQPEncHrxkY}iIW@@_~OP9Ms$bCFt!iH zcbnCHoVR^Db7p>DsZU_nt-i6tW{qu8D)A;GpCj$%$!%qunnE890x~YM5YRKdPeLf` zGp1eED$wC>>R0kz^qdIlonhv4D}&&BqT#95%&dy zO>SCs4?X>`!gun*Xl2LmTrpeyw5G3Ct!<;~E;Z3#rKhY{_QNhH--U`$6GY)sB+8Us zC>mMZUH6XV)fO)h*;a@Rjg%vpd23T-VdH2<_mtqLWa(m&5n8A2RvnjOsi@mL^`5&% z3?$8WXEYhsP)Gj0w~pvu+sy0c^Rh#F#bx=Pi7~FI{5K5JiY=0NPK_D1J+~@?>}oZ& z_^E3(#?6)_52w$YH4{n~*a)fnBQj--K#>g0egn~-f;LRwmt$gY+w^OhtOO&7atf=b z4jyi}lyJ4gGO3;&fectYSk3f4_|j+|br5Rb5xiM#-FTMQ9r}>T*O*Guu&y!d%G5U^i=n!*kGW2+iSf45O)sqL{|>LCNbj`zQNDQVo4 zV#72I2ZPjB5KF2v0U`4kn7P28pEeN}#jK{!HO$&rG3Sy0l#~NSo2*_+!BrC5keWfj zz_fw&T#|_2o(ok1UrU*raTV1a$E74sP-UIQPuEvUm1AT;haO^7IiAKxeg?*o~SQh&QOHGNY(o-D0V%I#CMxdQW!;JnvstY!0iGy0EZd)0f}f#7?P{JW@wKwAH6 zV_?wR2ePQjckF-K-o?)g&H0~XbIu&Tj8vTcYuw6Pei^qVH&Gd@dW{@-2QN}L7gw>+ z<)QKWBs>XtvpK&haU1c5Urgi5kKg{S(y&Gc{iiWLRH{3^K}&A^*S!_4{Bmz0&C6K# z#^Y)H%e^&N$>xibepY;YW>;OyMCQlEb?+tC#O;tf_OXz|Xkg`IJ)(Lqq_gCu^6sjJ z)5n@&4IOkL@{ge-!ZX~k&M7y|8y5hC+gT`jb#4Y!3 zX@91D$WI6J(|>6vEaXtW5k_B8X!FbYiC$;u0a~-Cf0#|?;>wup{N?{mlpe&}oQ9uqa=r5v3R25zb&g=I-glfoql;;p^QRkh@-~)(NqtWT(C6q%cS%h|CAC zzsG{EUzErgBgc>#+!)<*4ZKPfl9~_1|dL z^S|Tk%{+|7)#MMZS-M;q8vI3b9-04vtIHo;*M2pT{@|*a1AoJsHR&p<`5(9n{lWDc z7T1hFxMFM2TFQu%FkR^eg^ro%b;+!!I&ba%E3O=*Lo3xLJVWsghj#ygYU=+3RbV9+ zRlTj0KMkwi+t18X5TmJEr&|WN|G_d?(1rf4zo(KF=dCsbe^j^R4EPZ|NU+pd3wSp(n>W{Iva9mK4p86Tv6tOh zny6KMo|sI;gCK#990hgH{6h2xJ%wYREy4mz>0z&znuDNK>4pM*MYj3BDCN@`n@O%5 zN}qiG3sP!&i&A@Oj89%nk!=Ngr%D zL@+P^^|oW!}%bT-`h;w4a#iqnfskk*CAz8j;LPL%Lqyn<*5(Z0KMexhy4;FGP!mmB^@y4(47`0Ra(jaIh&gk_^6p^E;&uF*BkS zz&8j1Y-uu_N!RJG&;yl~u@Z?m;WO02vlqbwQyfGqkq1wXOK3Tsk*&Q0)h!H1f`kBq z*gu}`{(7ashWBS;J$Df@rWU3(UM&7-P9eG{hNx8KjMSLv>EL;2d` za_B4P7DmA}>CHWE?oVHDk(1i0-`Z(=$IPG;#&ze+!inAu-M*U}t`T$vvrTX%U_(3l z$9dqgA$64n?fh_C_;bw^1DECzQL)evM>{W3RF{9XR*?JnF*r%3MD_*i%9VuZaPon~0 zYL^1VuRhT!a;gSgHcS-oSHl^?zv`X^|&tT3WmW#^q`(kfK9PD&THFm(Bd^nGOD5zdnTZ$KGTE8 zyYMP1yC??7GA(Q}fL{RNmw);UI9Qp{!7spol_)yKBq`|CuSKDqVaRat?NtYOie4Sf z@w4D4YAP?uv~&AzJ;BZdx!W@B9+1TYbe-WQj{uJdkA$?0w!&p{$EwV!!DV{K8u~%P zgWS`*mhP81=#9No^kqJJ^Cq^%RE*woZ++OB(3;z7(Gmd0mLKrDhlA=@7YEqK+VM{Y zt0th@gs^2iN<}JC6JyIdTSE(Pu=`!L&sj6kt{Py#b9#WVv;R%ZU2y?k@b5L8$}KCU zP}BLX>xWNM8Q!sk*N}Y8dKAg^bbc=ro@e2kmMwzZ0B!&~0I7jkV8-`n=+qmHtjkf# zrS)e&KYrs>x+(TyPw#GNY7JQ>N|EpIO@pnZ_U$G!;zb^XwVeZZzZ$3HcT(D7L}+Bv zAO*r7^|n`rp0OxG-xxN0W7Rva_PPBk&A2iVNY3Mjj1g@H|uVMWiC8)bvM;-yacso90e(t?6bv5f?bN3sM@6F~kB=lbr-xt}$w( zn5~x-p&3^w=Xu>(hv!@TrgO8*#c1DgsJk!)NqhX(HE%i7n&&ri_cm#*^*?aGK!e>| z_s7iBg>Q1#OFg#!iR*mDZ$_NeE*yKuzB%^OX;rGrf@D+lTXr|`cWhtV0_5q`j1?*$ zNWG}6@3$Xrl$+gBxA)E&le)LnEk$zI>n&4(3K{1QcjKTG1`&gdqBZC{%UL=!0Wmdx zWGJC>L%tqn(43CVvP+jcdst4scZ74GGuG7mu(7)E?TD>5EtRERLRDlP$=vY3{pjJW z$N>1L;pKQ}hm-G1ZvpcTHQyF7t{+mBunNf)mluahw| z)V9(KMps!LTgh3?r4Ay!XQaqA+sXwT`EF~JDk+OJGci?~-*T!_g#mSIL#KW4x$|ym z-FI#Wtdxq|g7p-tz2w55*@WufUVdFo%jcX4$Ct~$s8-F+nT+iUwk<^)6*}F0LEeb( zSmV#8H0!dWDr@$_SN~ogWwBEzPA#pbF!e1`wYIg&)wRTz-LmUbZ^KhGEc)jB@m_%j 
zQKd@GZx*;7(+wqDsHI80T)Rq7ymq~lZ|5?mg;d^Bv#kEDp+2s?HwCuudGnOT)7dgj zc$mk~k~@QE#dRkyHg0VQJ%N8g8pu~)KA%m~z896C_FdXPu_-;^VG1)f%d5wcFVQVE z#>(iO`LVJVixHb>N5Mc>;=oVrP~Ee$f{3&?)iI{SuPse7_KF5WBYVQ~OdIu%E+$1M zcgg1PiWE%tx~G>;-ZRALE@Li*B~pm53(|_|Zcx%BCHcm8O)(khpaDv>lG_eKgfrrZ zGlDgarDGjQN#zGUG;OG!Hn(lA8*w(Fs07^5UT^L0mx9)nAE`7_zr_M*rp1-i8jA-e zPq!d=M)3py6IBD#+v>7f3ZguGn6C1D1h}3}SuLA{JURtZ1Yr@Cli#<6T!#<=FM$c9 zB*qaQy#9(ojDr|-5%@|B`i{AKTuAV1d{pF;FQQlp*|Skgf6Sy^p*onw zV?=9ylErnq4eep@eVn|^l#Di@((prmY&hJ)P)HXMZuo2pUy&F<9*%>zp?6>p&t;ZKCFQAo(B@gayJ*KpvXMbP^7@gm%-iBrOW25S7%IgdVi0 z-Xm@kD3(9eoQTdHSoO-@Y9%p#F`?5_^)nk|h)(C;Pn_c7AK53bw|4?_f5U$$7f_@C z=T)&KH_=z^ZStf6r{uO`3VZH}dgCSBdy2O~+O>s5PQ$fX)Ksd!9f8;$WCNdM_+#p2 z({0J_QQQdeQ;X=lE=TKiNbiC-6|=2Qqy9d3N$kF#Q16T)4TBlkv@iX|nS36!?pO$@ zkG8jc6mzMG`l>(i{hV^|#gXz_Fyl{De0erm2K!A8_~>xduA?&7qjs*(C!5!9XUdM8d4z^JCO8j7)q# z9cVe2GczO<*>MWp`DYi+kAqCy%v4yobkXv``gT`r4RxMpyF1(H8OIT{*7|YOHJ8iK zkAWv0Yu~xMA1^x!F6WI07VUEu9FRnu3JrJMwLioiy5XyK@795}OLG)DMW>pnjRDXQ zMGnt*BU!Qztag8=UZZ9#XeWSw%g7vTWMl-#K0QP1|3AxPobY`~B#Ck;+F4RF%!3A3 znJ<%Dt80lnO&FkTEY!4&8^fFG)~{!ZIK+AE_M5tl-B#b}$vN7wYf$0VN0*G`k}z5;(G z`yPR&Us2ut5=S=O$=Yqo!N<#-k50(XNNKkZSB(PHq z7Mq|9*#gu>b0)}e@;_}@RX(^36Qjy)ZZ|uJA%=?yFd%`?k4Uai=Y9Bm$C^K1S9N56 zPPPhEKA{M3+`7JY>gM~AFlD&qY7CH_cIcoKc!kfGJ zDFbj$Jxq}HzE9Vjd~VP&j#yJzzp!RsPV?}RRe^^D&Idj|M!$IB5;(Kv`igiaf1qgx zV63Y-HHrH+-E^iT!t(-E^U zr^U;5;#;_dOerlRs^d0BjTs**!D}Jqu?97z;*@dy<$gP8oAU|_0!~qQelAz(x&(Fk zjI$|=uhZeCu8W19HMuwm4p{)Mk*o{AXfyQGP;&d5A=9l+g{mlL(6I~i6ur?fK>b`` z?#pKVAa52SbLppA!_qI3Gz>lMw9Wl`Ec$7BST{kK-3{f26*Fr^sV^U zlw#`lw>${PXjR<{o=C;J-?};qKBuI=#hf@mLcB3H6&b#I>KTsv=25hOA4QPpb%gO%PuSL& zzgRZT%usg{&yF7GX}o!)Vei=h?#T2y-SaR#A}K<{w~U2B5{@JkfkLm{knf_7fRoe@ z{qC`15E^&6{WAgVbTqUHwrKIB?r&=qR?7M&cqACZ1@(5(R(--X*BD1G z?&M^5`SCLcsH1weHzWm4X=?9;fmkj0r|@eDCc-9rV6i4QFEIx<=@eLph)zBrDHq7A z@Dyx0@>os~BA(O4pdn~Vgi=xd3~_o~&KJQnR|qm=nohy9w^#f#AKFPQcZ+S3+WKaU zrdEEM)irO_cakos`FRS6nvJ%N0u70>nWT9XjcUI1MoKPQex=Fh5YiI0cS2AjG`bpS zpcqDTN;sPjq%}Hlr++;D^CSQ&aGU{yv!{JqO_S2Um^%`eU63Xxf$VW(S`#mIz3Cd( z+?URHm$cgu;*Unql&GCWS;2>;u6d;#5k;qkQcW|0C%t3?U)_&l-Y!_Z_A_=@WmFN9Z&Rh}ZeVH>=+cjCBu*3i3S~uJ2y~OO?=gb0$yKyh&mK4l@X{sIkec z3_G=WNp*#chhAc%dwC5hvI=wGrCnto!JZA?(n(ZzbIE@9PS;rVdz504gw41ro+#+E zbFyp(7XuTcuBmKRxgkK9*9CFGL2P&NS;IHIgovIh7U9bAs@n#p%QG^lTp#E0C12Ex z*gkpX%#8$XOH4(8M{q6^bL!ns`!zG<=tV1+|68VQ~rqGE)-=Dgjg9Ni16;+Z{UAv*OWp z@8Ux1L73A)(_Y%HS@sKVp-0b2Z~^#V6PyOEp^Xbl90cws&Y75KnS2TrS#6_v;S%1B)|3W_SIG6h?NU^Z%p7Ex8j`kN*V)D=*WC77HseVJ0!9pw0DdP zZr-c*QE8i+elt{ZFnx-a2wx?n{It8`i4wketJp~Aa85n!&KXu9buCSU&Ag`s%$-B# z%a9j|AzU;li|lygu~u^z&r9NMfX)_9rAe0KOIJhjU7}TnIcNZF@8vYOc$*YGW!Ozu z*6sPg!vR`4@?7@&!7cu+Fl9<`OWvd*ow<97EP!g7l_I}l3dqiWw&?Dx-1HHZMDcSm zkRCt}U;Qg{}f1 z1UZ59unbXwiOO5y03pCL`Yl1WxB(^dgS~FJ;z2o`sI<9VxE-o_5tcZHizG*f*aL=w zCk;FjpqQ@Ps(^<-<&Q$2irZv)ZR103BZ|*(%-0mWyIZ2AsbQ-M7g~OY~aF)%?yC?<|v;_3SQ><3+!(&L`f>`L$38 zwHgx)XSt)RlF%xakX8FLXod%!AZSYz60#kdJy;W*oP^vEylnF|8ZTKiqb529x98&X zW;_F&7l?1N@qdz{li$k-C}*-k^7in@v?9~PH{GLqGi`d>A$N+qD^yP=IH>Ka_z!N% zH*TI#%TeJ&o4@LxHHjX+f*2_?GnTxP;J^8nIMzt1+e~|<)7;Yc&R{wA`L8!6ef57y z6LSlvnCn=ws*Pg*e#mA>@BXw$&1HSaqNW8av8* znM@#(Jvt#ZwHyM4)sykT`%K`zqodbiwEVZ0#Cm|PF z_up9b`#p5nI7q(s>~ZRhIS)}TvTXLg?g|j0}pu{geygxIe^O$P~jpV(fwH3wK`CBy#8Mn#r?{W9dC=_iu7)yT) z1^ui@^4gbgK(lkf1{RPmXy?y^eM2;oM=-UT?|Z@KA8D%3p;~Y=0Dtt;E)jweya`b! 
zclNAtAAfWSO}Z{Zw%4aHJ@A{F8K~s`gxp7_4dUa7kUEnGo8=HEzI)=u%o>FuYIlM1 z5j-bw-ero!YN+NRw;MPJ_uX<0yIg|_g6_-QKL0JLY?OTp*Ow|jdH_2ji+v0j7-h)sbF z+1;>y1zmC?t2j}T<4-xCB*IRnnBgy)DGy^lhAa`!lbr6Sw@xIRe$_}m|KyRAl^3Nk z;e*Z=P5Bz%20QMZ9h^Ynk2t6vvhFgpX^(Z);i}GtE^?4{Etrz)wjJmc~8>z$Cs#k9_-=r zNJKSA_ptX9$;L?#d#ADR_Xf+1jn5^90!q~BkfL9Ehh92lm>LQ zPZAi}!w&NChCh|18b0YGoAe|1rkmg;X#+Z4qO^K?3MlL(J>{jCZrfE{3kLS-aRfkuB*xu4)em@k(8r4Aq!#9=C`rThutIRvAYPW0;A-i*-7!Ro?~!U_>M~#=fd#`O5rsd5!l&9eImz#cc5U@mHtZ`)&{XZ*ft2SP@g6W%HnJLq^EP zduw1hE3>m9uPGjO;Dr@{odJsL>*adqo`>6? zp_j~hTB>gjY%6KM$FNeZWffD^*c2O~GF3qIb7Cs&gi-_@)ebd02lRLQk`;dO&*#U2 z@D^x^SJgWGk KMp7kp$o~QO3ASPY diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 9b0e7007c..cd9ef157f 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -33,6 +33,8 @@ import narwhals.stable.v1 as nw from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read + if TYPE_CHECKING: import json # noqa: F401 import sys @@ -257,6 +259,7 @@ def __init__(self, name: _Pandas, /) -> None: ".json": pd.read_json, ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t"), ".arrow": pd.read_feather, + ".parquet": pd.read_parquet, } self._scan_fn = {".parquet": pd.read_parquet} @@ -274,6 +277,7 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: ".json": partial["pd.DataFrame"](pd.read_json, dtype_backend=_pa), ".tsv": partial["pd.DataFrame"](pd.read_csv, sep="\t", dtype_backend=_pa), ".arrow": partial(pd.read_feather, dtype_backend=_pa), + ".parquet": partial(pd.read_parquet, dtype_backend=_pa), } self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} @@ -288,6 +292,7 @@ def __init__(self, name: _Polars, /) -> None: ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t"), ".arrow": pl.read_ipc, + ".parquet": pl.read_parquet, } self._scan_fn = {".parquet": pl.scan_parquet} @@ -304,6 +309,7 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: ".json": pl.read_json, ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), ".arrow": partial(pl.read_ipc, use_pyarrow=True), + ".parquet": partial(pl.read_parquet, use_pyarrow=True), } self._scan_fn = {".parquet": pl.scan_parquet} @@ -378,6 +384,7 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: ".json": pa_read_json, ".tsv": partial(pa_read_csv, parse_options=tab_sep), ".arrow": pa_read_feather, + ".parquet": pa_read_parquet, } self._scan_fn = {".parquet": pa_read_parquet} @@ -401,17 +408,19 @@ def validate_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: constraints: Metadata = {} - suffixes = ".csv", ".json", ".tsv", ".arrow" if tag is not None: constraints["tag"] = tag - if name.endswith(suffixes): + if name.endswith(EXTENSION_SUFFIXES): fp = Path(name) constraints["dataset_name"] = fp.stem constraints["suffix"] = fp.suffix return constraints elif suffix is not None: if not is_ext_read(suffix): - msg = f"Expected 'suffix' to be one of {suffixes!r},\nbut got: {suffix!r}" + msg = ( + f"Expected 'suffix' to be one of {EXTENSION_SUFFIXES!r},\n" + f"but got: {suffix!r}" + ) raise TypeError(msg) else: constraints["suffix"] = suffix @@ -432,10 +441,6 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" -def is_ext_read(suffix: Any) -> TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} - - @overload def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, 
pl.LazyFrame]: ... diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index e9546d2b1..cdaa57322 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -4,20 +4,32 @@ from __future__ import annotations import sys -from typing import Literal +from typing import Any, Literal if sys.version_info >= (3, 14): from typing import TypedDict else: from typing_extensions import TypedDict +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs + if sys.version_info >= (3, 10): from typing import TypeAlias else: from typing_extensions import TypeAlias -__all__ = ["Dataset", "Extension", "Metadata", "Version"] +__all__ = [ + "EXTENSION_SUFFIXES", + "Dataset", + "Extension", + "Metadata", + "Version", + "is_ext_read", +] Dataset: TypeAlias = Literal[ "airports", @@ -96,6 +108,7 @@ "zipcodes", ] Version: TypeAlias = Literal[ + "v2.11.0", "v2.10.0", "v2.9.0", "v2.8.1", @@ -140,7 +153,12 @@ "v1.7.0", "v1.5.0", ] -Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"] +Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"] +EXTENSION_SUFFIXES = (".csv", ".json", ".tsv", ".arrow", ".parquet") + + +def is_ext_read(suffix: Any) -> TypeIs[Extension]: + return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} class Metadata(TypedDict, total=False): diff --git a/pyproject.toml b/pyproject.toml index a3f99b7e9..43370cf7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -250,6 +250,8 @@ extend-safe-fixes=[ "ANN204", # unnecessary-dict-comprehension-for-iterable "C420", + # unnecessary-literal-set + "C405" ] # https://docs.astral.sh/ruff/preview/#using-rules-that-are-in-preview diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 205a0d958..e325147b2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -400,7 +400,7 @@ def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[Parame @datasets_debug @pytest.mark.parametrize( ("name", "suffix", "tag"), - list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.9.0")})), + list(_dataset_params({"flights-3m": DatasetSpec(tag="v2.11.0")})), ) def test_all_datasets( polars_loader: Loader[pl.DataFrame, pl.LazyFrame], diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index c8e67c394..3702028ac 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -193,6 +193,9 @@ def generate_typing(self, output: Path, /) -> None: NAME = "Dataset" TAG = "Version" EXT = "Extension" + EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet" + EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" + EXTENSION_GUARD = "is_ext_read" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" NOTE_SEP = f"\n\n{indent * 2}" f".. 
note::\n{indent * 3}" @@ -276,14 +279,18 @@ def generate_typing(self, output: Path, /) -> None: f"{HEADER_COMMENT}", "from __future__ import annotations\n", "import sys", - "from typing import Literal, TYPE_CHECKING", + "from typing import Any, Literal, TYPE_CHECKING", utils.import_typing_extensions((3, 14), "TypedDict"), + utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = {utils.spell_literal(tags)}", - f'{EXT}: TypeAlias = {utils.spell_literal([".csv", ".json", ".tsv", ".arrow"])}', + f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}", + f"{EXTENSION_SUFFIXES} = {EXTENSION_TYPES!r}", + f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" + f"{indent}return suffix in set({EXTENSION_TYPES!r})\n", UNIVERSAL_TYPED_DICT.format( name=METADATA_TD, metaclass_kwds=", total=False", diff --git a/tools/datasets/_metadata/tags.parquet b/tools/datasets/_metadata/tags.parquet index b932af7c5de7eaa7decace6422fd8191fcecb3c0..f8ed6f54e46e03902d48eed24ad595faeeae94fc 100644 GIT binary patch delta 2269 zcmai#2UHX37RP5uXd#doGDCuaP_vX!1w@DFaDDAmQ3sN+Su)v6L zA_};GC_JSZkS0Z11f|JS5EVg%=#!!-_z+%#tjm`3-pn~O_sqHX|K0iS@4NT4X{s4@ zr~Pg-2o3eiLLb3x5DNkTK<}2x7q2%!A_`!}r%-9Tvpn8XOKO|j+IGjz)1dv;+-?xq zWLSW&94RL!LweqLN-W)tjAD&84HWg&p73yYDXl&+LD|x{sCIRGb z0pbM?(;-$$x&g>aRnS(sv2BC_Vab+&44zCvg|dAiEX;y4Qs$}LY;~vzw=$z-86R~( z_IbhLQZxgr)SSV@aw7~f29b`wA+T(CTFh_ z)`?{Ei)D5l?P_`ac)RkpB(|$XyLTaxj+^cxs_>nu)*S4<#kRD$SJ&TbS(IUE{D|nVwJ(Whh$gUf>w27@l{dP*0i4X;dJ>sx7v6OXH*JRHH%qtx{b_J zgU=6_jL*!wYP}Y?pB@mrm^2FSHte_&Q>biJo0a6BS9<2cp}yGkoc--p%&wRb{Zp>I zS5J-2Ti7a*hwM$ph5G$zvvX~4_a!!t>E~DeIJ_g6GpN?HSv8aRrcim_@F}CI`e?m{ z6LGKN$vt2G6>4v%`}pz)nCkW*E)KugWK@9m`H%MM2WXMYm3Y2lPE^4M-Qt@Jo$i37 zCnIJHnREGd@#k6c>g-R=2$jpW&dL49neyWseoBg+I+DgvxitTIHYiz6^I?{`U&S>? 
zfhUgdSl(86+=+23U$23*_3pUm`B|(_MG>#}qY-(ilJP$4$90psWwG7b9woB1lXJTg zdQz(@b@qfDZm%>=eMXk6)+0II;)&00pA%*7Z%j;{)wrs@MzYnM@*KLZ+envg(y_d^iHrLvO z2=96x%rSm@EsS5qbXh{M;n~1Q_468hp(@Fp9!g+m3|m#~^;ZpxGWaEAu&_k%PtqM9 zhYy&t!Y87*?g+u#;*~|?6NF>m|J)ZNLC{%_ z-QL@GEZkb)%~_WbG_iIv2ta+3R+Pbdb=8wOsDA@{>#10;;$TmJs`CuJ!G{$i~G(o&`u*m z1}BU$({Dyj?$U~LFDolQ_;wn1)}5usX&APfcQX6G(cLBL?AoVoAdS9fbb)bR&IrSQnbOa zAUq@k$xy6zB3vSDvou^Z6g&VxT1<#Eu*VtzP~S382n_b|4EFK&VTbrENu`2t5lH)J z&G;HFFPHsB$=S{M+)v6)JQx!HbD6( zv6?yM+s^$h1FA}J-zEr#r|qE)08MC#bSo>11y>YTl`E167BeLd=d4%g{k5D7D?m>U zE#5WBMUwmamOSAYN3q7cC5EdUlp!YrB`oNHe8k{E;fFMAqHGynhq delta 2132 zcmZvddpwkB8^@o=Fb!tL44%gflbmLZnP$vr9TUbWWRb(v`-W!P9Lphx(pH8^X{{PO zs0t z$Mfex%FvZ1&|A6LD8Tu8?lPKx{A|~zxdSg*?(Ro)pITQA&bTcB z2q2?^&@|MjcLJ*55(w$swoQk*>nBUt`~#uS0~synB-&Cx#_YQ6X}m`L$TQ>f*4s@~ zAOKnqMK6c=xkniwA7QE+>#)6t>>(`K5Z+4Sd8E`DI$4W?MtCFjm2$k zMxS2_R7JnQ;p<9KCL&Tj`)0zgLbdz-fZ{9Pc%gfL;DO;sY>PJGg_9nzOC3p#kxdIJ zW$mcF**Sbg$G@Oi{Q6W1Bex_~ptgnZ1|3sa^zxglPf6}>h`rS7>ZF0jx7$wcoJ!x@ zCaQZQ2`-4MwKMlO-g}RJr)c<(-BD{EJs5eHxu46@KPd|>;)Th~8<^N#RmEks&4Fe` zl+=QW4L@d+3RCKG}#G%`!LeZ zXB>-+a$l7~?mS$>RA$*ACO$%yVM#}SIs8LeF!}Pxp@E%L0N}ZPtSXj1yfw){=t?~z zP|JbSo^t$Oy=yKeWF=Em9V|ZZOE}+WnZ>Sqk5C4-XC~#LO|Q?4D$iwE_e}h$$2myc z$9E9~%~^)@`g%%DFJI5RyTZ`$+FZOkgV+D5y=_IhU0&qLaF>c@j6JGjObySZW5(}N zvK=E@D&q{PeDUi0dtNL zr-A@MAPE`AbXBVZP8fjF!hG@>F$Ap=^A}@!Uow^$DV%34#{eE6B7T~|3eK*%LWP9I zoEZ_Q27qpCFCI!`j$TpjOmoe>d4QPLXX2@8VKiqPb9&@PX&rj4dGzD4t>qom8xtc1n+4VROpS!CS%ja{^=%`^{_U8JKNR%}1HBvVGSd>}?z>o5J{mQuW*FpUnz<1z+9DYj zzjm&P76jM#dS(ds|M4KYKI+AErG0nI*|q`crjJK(x0zkF3DsIwmyf4zEp6C&B>YaI zVSRANMd$9`0+YB=23zDU-t%*$7rn&#;ac@jTgSBY4`MV5&50buHl}X}Sc4j9j>m~B zI#{QiG?f|@3WwJw#5<+qNlt0$Vl)mb9}xl+bd(Vy*ucp){`JdE2vsxv|b|<^>1cMVqf5(w%r+^7b?}InJ|+mJ{_>TN0qFPW+{+ zJZ8taOP#U&OoX8NP)VPSOsP6)um8}O|0b8Iwp|_khc{2PCx7t84HVzDHo7waZZ zfpIJcKB$6wK?=YaW)Pg)2}WtV&YYL_nT2KaSV{w(%M||>Rf~#UsCH2qu2Q0PS|I5- zMWGuwmN_3psX76%{z?Xbk`G7#;6-i33zNQ!oNwa=2;T>3k1XwxZX5|&i8oZ^Lj(v7 zp($2ANI0H~N@G|dhwyqhKAZ=_H6ZPTjlFzFir1GPa~Ho%P2R7Z5*3B)6J8g?hXdho zTC3eT1C8H6&wwSWMvd-*?7 z)ue>5u*8kY30vo{=nHdM@USB5?JB4Y1<(e$ltpfQ-CHL7$pwPpLLTG*(5AjwobWaK z$V@ojRer+2yk2U(NdNz?AtBnj_!@=$Cuc?ZE?WwciziupVHO=Dzxw$_`WyiOEFXk0 wd~ZIjVkHHc!;|TMn`dtY&_R#{%9G#$g)H9%G6LYbT`-@9fxEfq2#^@{Ul5T0H~;_u diff --git a/tools/datasets/_metadata/tags_npm.parquet b/tools/datasets/_metadata/tags_npm.parquet index acd04f2c79bb6936ef8776b2a066caa0c99d7515..dac952f9fa86a2de165343eb808b376195022c29 100644 GIT binary patch delta 815 zcmZvaUr19?9LIm>?Eaa5rkn0}xHm7DQn$5oZIPp3h#{;nDVYQYJvbIL?T@3xhdM%x zhzcRb@iC#K(HleCRQx!U{5c3aLiDK{&KV(iNC`M&tt$<&d2s=HXh9Xwy=&Z%sH+68foI>aT~*3OB`12iKVfJhWQN?@W76sy8j zCOMKyuluL$3$fo-$HX!x?*sKrtgFUBN6&Y-vZIIpbBV#{(*a7y} zs>+5sNZNye!FAc_RH+%KCSAR?Y0=>_>t^fc4 delta 776 zcmZvZUr3Wt7{=dozJJ@?bZdJL-?vRHp|5S^+9XGXg(CeSx~SzvAVexyS*{TxaVm(> zMMMPabg30G(F^IKKSPAV!kdC3qHYWdZ&V7j3!#gc^nGi_pj{kzIQ)3t=Y8JEz(k*s3Vv*{iTS~Y%dQn9+oEi{!rR~a%*iUJnn31-5Im=PDK+J-tJv9( zz-|RlXGI_G0drOjSJhhMP{9~`6hOvSeRu%O6=yYG;|dmDwB4t-*!Zlw3jeQm@lXxb zzqje{WqPANEM7@^t*ZSVwq%1-e%tn|@M@tQm4+MaB?+ TypeIs[Extension]: - return suffix in {".csv", ".json", ".tsv", ".arrow"} + return suffix in {".csv", ".json", ".tsv", ".arrow", ".parquet"} def _is_str(obj: Any) -> TypeIs[str]: From 95582df0847c84c61b41a349887a4a1b703477cb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:35:45 +0000 Subject: [PATCH 107/137] feat: Always use `pl.read_csv(try_parse_dates=True)` Related https://github.com/vega/altair/pull/3631#issuecomment-2480670438 --- 
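A minimal sketch of the behaviour this change relies on (schema inference for
date-like CSV columns). The inline CSV is invented purely for illustration, it is
not one of the vega-datasets files, and the snippet assumes a recent polars
release:

    from io import BytesIO

    import polars as pl

    csv = b"symbol,date,price\nMSFT,2000-01-01,39.81\n"
    plain = pl.read_csv(BytesIO(csv))
    parsed = pl.read_csv(BytesIO(csv), try_parse_dates=True)

    # without the flag, "date" is read back as a plain string column
    assert plain.schema["date"] == pl.String
    # with it, polars infers a proper Date dtype
    assert parsed.schema["date"] == pl.Date
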
altair/datasets/_readers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index cd9ef157f..54edb909e 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -288,9 +288,9 @@ def __init__(self, name: _Polars, /) -> None: if not TYPE_CHECKING: pl = self._import(self._name) self._read_fn = { - ".csv": pl.read_csv, + ".csv": partial(pl.read_csv, try_parse_dates=True), ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t"), + ".tsv": partial(pl.read_csv, separator="\t", try_parse_dates=True), ".arrow": pl.read_ipc, ".parquet": pl.read_parquet, } @@ -305,9 +305,11 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: pl = self._import(_pl) pa = self._import(_pa) # noqa: F841 self._read_fn = { - ".csv": partial(pl.read_csv, use_pyarrow=True), + ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), ".json": pl.read_json, - ".tsv": partial(pl.read_csv, separator="\t", use_pyarrow=True), + ".tsv": partial( + pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True + ), ".arrow": partial(pl.read_ipc, use_pyarrow=True), ".parquet": partial(pl.read_parquet, use_pyarrow=True), } From dc4a23013d39b88b2047c8408b902081a30aec96 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 16 Nov 2024 21:46:07 +0000 Subject: [PATCH 108/137] feat: Adds `_pl_read_json_roundtrip` First mentioned in https://github.com/vega/altair/pull/3631#issuecomment-2480670438 Addresses most of the `polars` part of https://github.com/vega/altair/pull/3631#issuecomment-2479333070 --- altair/datasets/_readers.py | 36 ++++++++++++++++++++++++++++++-- tests/test_datasets.py | 41 +++++++++++++++++++++++++++++-------- 2 files changed, 66 insertions(+), 11 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 54edb909e..e55d28359 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -38,6 +38,7 @@ if TYPE_CHECKING: import json # noqa: F401 import sys + from io import IOBase from urllib.request import OpenerDirector import pandas as pd @@ -282,6 +283,37 @@ def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: self._scan_fn = {".parquet": partial(pd.read_parquet, dtype_backend=_pa)} +def _pl_read_json_roundtrip(source: Path | IOBase, /, **kwds: Any) -> pl.DataFrame: + """ + Try to utilize better date parsing available in `pl.read_csv`_. + + `pl.read_json`_ has few options when compared to `pl.read_csv`_. + + Chaining the two together - *where possible* - is still usually faster than `pandas.read_json`_. + + .. _pl.read_json: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html + .. _pl.read_csv: + https://docs.pola.rs/api/python/stable/reference/api/polars.read_csv.html + .. 
_pandas.read_json: + https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + from io import BytesIO + + import polars as pl + + df = pl.read_json(source, **kwds) + if any(tp.is_nested() for tp in df.schema.dtypes()): + # NOTE: Inferred as `(Geo|Topo)JSON`, which wouldn't be supported by `read_csv` + return df + buf = BytesIO() + df.write_csv(buf) + if kwds: + SHARED_KWDS = {"schema", "schema_overrides", "infer_schema_length"} + kwds = {k: v for k, v in kwds.items() if k in SHARED_KWDS} + return pl.read_csv(buf, try_parse_dates=True, **kwds) + + class _PolarsReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): def __init__(self, name: _Polars, /) -> None: self._name = _requirements(name) @@ -289,7 +321,7 @@ def __init__(self, name: _Polars, /) -> None: pl = self._import(self._name) self._read_fn = { ".csv": partial(pl.read_csv, try_parse_dates=True), - ".json": pl.read_json, + ".json": _pl_read_json_roundtrip, ".tsv": partial(pl.read_csv, separator="\t", try_parse_dates=True), ".arrow": pl.read_ipc, ".parquet": pl.read_parquet, @@ -306,7 +338,7 @@ def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: pa = self._import(_pa) # noqa: F841 self._read_fn = { ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), - ".json": pl.read_json, + ".json": _pl_read_json_roundtrip, ".tsv": partial( pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True ), diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e325147b2..221666c35 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import datetime as dt import re import sys from functools import partial @@ -35,6 +36,15 @@ CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" +class DatasetSpec(TypedDict, total=False): + """Exceptional cases which cannot rely on defaults.""" + + name: Dataset + suffix: Extension + tag: Version + marks: MarksType + + requires_pyarrow: pytest.MarkDecorator = skip_requires_pyarrow() backends: pytest.MarkDecorator = pytest.mark.parametrize( @@ -346,7 +356,7 @@ def test_reader_cache( @pytest.mark.parametrize( - "dataset", + "name", [ "cars", movies_fail, @@ -361,7 +371,7 @@ def test_reader_cache( @pytest.mark.parametrize("fallback", ["polars", None]) @skip_requires_pyarrow def test_pyarrow_read_json( - fallback: _Polars | None, dataset: Dataset, monkeypatch: pytest.MonkeyPatch + fallback: _Polars | None, name: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setenv(CACHE_ENV_VAR, "") monkeypatch.delitem(sys.modules, "pandas", raising=False) @@ -370,15 +380,28 @@ def test_pyarrow_read_json( data = Loader.with_backend("pyarrow") - data(dataset, ".json") + data(name, ".json") -class DatasetSpec(TypedDict, total=False): - """Exceptional cases which cannot rely on defaults.""" - - suffix: Extension - tag: Version - marks: MarksType +@pytest.mark.parametrize( + ("spec", "column"), + [ + (DatasetSpec(name="cars", tag="v2.11.0"), "Year"), + (DatasetSpec(name="unemployment-across-industries", tag="v2.11.0"), "date"), + (DatasetSpec(name="flights-10k", tag="v2.11.0"), "date"), + (DatasetSpec(name="football", tag="v2.11.0"), "date"), + (DatasetSpec(name="crimea", tag="v2.11.0"), "date"), + (DatasetSpec(name="ohlc", tag="v2.11.0"), "date"), + ], +) +def test_polars_read_json_roundtrip( + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], + spec: DatasetSpec, + column: str, +) -> None: + frame = polars_loader(spec["name"], ".json", tag=spec["tag"]) + tp = 
frame.schema.to_python()[column] + assert tp is dt.date or issubclass(tp, dt.date) def _dataset_params(overrides: Mapping[Dataset, DatasetSpec]) -> Iterator[ParameterSet]: From 7ddb2a8c1e8ec6477cfc646c385e0b168f2fd330 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 17 Nov 2024 19:28:43 +0000 Subject: [PATCH 109/137] feat(DRAFT): Adds infer-based `altair.datasets.load` Requested by @joelostblom in: https://github.com/vega/altair/pull/3631#issuecomment-2480832609 https://github.com/vega/altair/pull/3631#issuecomment-2479333070 --- altair/datasets/__init__.py | 35 +++++++++++++++--------- altair/datasets/_readers.py | 32 +++++++++++++++++++++- tests/test_datasets.py | 54 +++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 13 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 3760a4f2a..4545d36b0 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -23,7 +23,7 @@ from altair.datasets._readers import _Backend from altair.datasets._typing import Dataset, Extension, Version -__all__ = ["Loader", "data"] +__all__ = ["Loader", "load"] class Loader(Generic[IntoDataFrameT, IntoFrameT]): @@ -320,18 +320,29 @@ def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" +load: Loader[Any, Any] + + def __getattr__(name): - if name == "data": - global data - data = Loader.with_backend("pandas") - from altair.utils.deprecation import deprecated_warn - - deprecated_warn( - "Added only for backwards compatibility with `altair-viz/vega_datasets`.", - version="5.5.0", - alternative="altair.datasets.Loader.with_backend(...)", + if name == "load": + import warnings + + from altair.datasets._readers import infer_backend + + reader = infer_backend() + global load + load = Loader.__new__(Loader) + load._reader = reader + + warnings.warn( + "For full IDE completions, instead use:\n\n" + " from altair.datasets import Loader\n" + " load = Loader.with_backend(...)\n\n" + "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", + UserWarning, stacklevel=3, ) - return data + return load else: - raise AttributeError(name) + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e55d28359..953401bae 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -11,7 +11,7 @@ import os import urllib.request -from collections.abc import Mapping, Sequence +from collections.abc import Iterable, Mapping, Sequence from functools import partial from importlib import import_module from importlib.util import find_spec @@ -475,6 +475,36 @@ def is_ext_scan(suffix: Any) -> TypeIs[_ExtensionScan]: return suffix == ".parquet" +def is_available(pkg_names: str | Iterable[str], *more_pkg_names: str) -> bool: + pkgs_names = pkg_names if not isinstance(pkg_names, str) else (pkg_names,) + names = chain(pkgs_names, more_pkg_names) + return all(find_spec(name) is not None for name in names) + + +def infer_backend( + *, priority: Sequence[_Backend] = ("polars", "pandas[pyarrow]", "pandas", "pyarrow") +) -> _Reader[Any, Any]: + """ + Return the first available reader in order of `priority`. 
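+
+    A minimal usage sketch, assuming at least one backend from ``priority`` is
+    installed::
+
+        reader = infer_backend()
+        reader._name  # e.g. "polars" when polars is importable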
+ + Notes + ----- + - ``"polars"``: can natively load every dataset (including ``(Geo|Topo)JSON``) + - ``"pandas[pyarrow]"``: can load *most* datasets, guarantees ``.parquet`` support + - ``"pandas"``: supports ``.parquet``, if `fastparquet`_ is installed + - ``"pyarrow"``: least reliable + + .. _fastparquet: + https://github.com/dask/fastparquet + + """ + it = (backend(name) for name in priority if is_available(_requirements(name))) + if reader := next(it, None): + return reader + msg = f"Found no supported backend, searched:\n" f"{priority!r}" + raise NotImplementedError(msg) + + @overload def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 221666c35..f903d500a 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,7 +3,9 @@ import datetime as dt import re import sys +import warnings from functools import partial +from importlib import import_module from importlib.util import find_spec from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError @@ -127,6 +129,58 @@ def test_loader_url(backend: _Backend) -> None: assert pattern.match(url) is not None +def test_load(monkeypatch: pytest.MonkeyPatch) -> None: + """ + Inferring the best backend available. + + Based on the following order: + + priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" + """ + import altair.datasets + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + from altair.datasets import load + + assert load._reader._name == "polars" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "polars", None) + + from altair.datasets import load + + if find_spec("pyarrow") is None: + # NOTE: We can end the test early for the CI job that removes `pyarrow` + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pandas", None) + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + else: + assert load._reader._name == "pandas[pyarrow]" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pyarrow", None) + + from altair.datasets import load + + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) + from altair.datasets import load + + assert load._reader._name == "pyarrow" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pyarrow", None) + + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + + @backends def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) From 9544d9b68e1e6c1786d823cdd9ef3e961497cfa3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 18 Nov 2024 21:39:24 +0000 Subject: [PATCH 110/137] refactor: Rename `Loader.with_backend` -> `Loader.from_backend` https://github.com/vega/altair/pull/3631#discussion_r1847157544 --- altair/datasets/__init__.py | 28 ++++++++++++++-------------- tests/test_datasets.py | 24 ++++++++++++------------ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/altair/datasets/__init__.py 
b/altair/datasets/__init__.py index 4545d36b0..d01ef6f60 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -34,7 +34,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") >>> data # doctest: +SKIP Loader[polars] @@ -46,24 +46,24 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["polars", "polars[pyarrow]"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / ) -> Loader[pd.DataFrame, pd.DataFrame]: ... @overload @classmethod - def with_backend( + def from_backend( cls, backend_name: Literal["pyarrow"], / ) -> Loader[pa.Table, pa.Table]: ... @classmethod - def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: """ Initialize a new loader, with the specified backend. @@ -94,7 +94,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") cars = data("cars") >>> type(cars) # doctest: +SKIP @@ -102,7 +102,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: Using ``pandas``: - data = Loader.with_backend("pandas") + data = Loader.from_backend("pandas") cars = data("cars") >>> type(cars) # doctest: +SKIP @@ -110,7 +110,7 @@ def with_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: Using ``pandas``, backed by ``pyarrow`` dtypes: - data = Loader.with_backend("pandas[pyarrow]") + data = Loader.from_backend("pandas[pyarrow]") cars = data("cars", tag="v1.29.0") >>> type(cars) # doctest: +SKIP @@ -170,7 +170,7 @@ def __call__( from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") source = data("stocks", tag="v2.10.0") >>> source.columns # doctest: +SKIP @@ -198,7 +198,7 @@ def __call__( Using ``pandas``: - data = Loader.with_backend("pandas") + data = Loader.from_backend("pandas") source = data("stocks", tag="v2.10.0") >>> source.columns # doctest: +SKIP @@ -222,7 +222,7 @@ def __call__( Using ``pyarrow``: - data = Loader.with_backend("pyarrow") + data = Loader.from_backend("pyarrow") source = data("stocks", tag="v2.10.0") >>> source.column_names # doctest: +SKIP @@ -276,7 +276,7 @@ def url( import altair as alt from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' @@ -302,7 +302,7 @@ def cache_dir(self) -> Path | None: from altair.datasets import Loader - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = Path.home() / ".altair_cache" >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP @@ -337,7 +337,7 @@ def __getattr__(name): warnings.warn( "For full IDE completions, instead use:\n\n" " from altair.datasets import Loader\n" - " load = Loader.with_backend(...)\n\n" + " load = Loader.from_backend(...)\n\n" "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", UserWarning, stacklevel=3, diff --git a/tests/test_datasets.py b/tests/test_datasets.py index f903d500a..0d2deae7f 100644 --- 
a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -86,7 +86,7 @@ class DatasetSpec(TypedDict, total=False): def polars_loader( tmp_path_factory: pytest.TempPathFactory, ) -> Loader[pl.DataFrame, pl.LazyFrame]: - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") return data @@ -112,14 +112,14 @@ def metadata_columns() -> frozenset[str]: @backends -def test_loader_with_backend(backend: _Backend) -> None: - data = Loader.with_backend(backend) +def test_loader_from_backend(backend: _Backend) -> None: + data = Loader.from_backend(backend) assert data._reader._name == backend @backends def test_loader_url(backend: _Backend) -> None: - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) dataset_name = "volcano" pattern = re.compile( rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" @@ -185,7 +185,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) frame = data("stocks", ".csv") assert is_into_dataframe(frame) nw_frame = nw.from_native(frame) @@ -208,7 +208,7 @@ def test_missing_dependency_single( flags=re.DOTALL, ), ): - Loader.with_backend(backend) + Loader.from_backend(backend) @pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) @@ -227,7 +227,7 @@ def test_missing_dependency_multi( flags=re.DOTALL, ), ): - Loader.with_backend(backend) + Loader.from_backend(backend) @backends @@ -239,7 +239,7 @@ def test_dataset_not_found(backend: _Backend) -> None: """ import polars as pl - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) real_name: Literal["disasters"] = "disasters" real_suffix: Literal[".csv"] = ".csv" real_tag: Literal["v1.14.0"] = "v1.14.0" @@ -344,7 +344,7 @@ def test_reader_cache( monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) cache_dir = data.cache_dir assert cache_dir is not None assert cache_dir == tmp_path @@ -432,7 +432,7 @@ def test_pyarrow_read_json( if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) - data = Loader.with_backend("pyarrow") + data = Loader.from_backend("pyarrow") data(name, ".json") @@ -497,7 +497,7 @@ def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: from polars.testing import assert_frame_equal - data = Loader.with_backend("polars") + data = Loader.from_backend("polars") data.cache_dir = tmp_path data("londonCentroids") @@ -536,7 +536,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - @backends def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) -> None: """Ensure all backends will query the same column names.""" - data = Loader.with_backend(backend) + data = Loader.from_backend(backend) fn = data._reader.scan_fn(_METADATA) native = fn(_METADATA) schema_columns = nw.from_native(native).lazy().collect().columns From 7b3a89e5b5374eb391b7ae73ace219327069f979 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 18 Nov 2024 21:52:47 +0000 Subject: [PATCH 111/137] feat(DRAFT): Add optional `backend` parameter for `load(...)` Requested by @jonmmease 
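A rough sketch of the intended call pattern (illustrative only; the return types
follow the overloads added below, and the per-call ``backend=`` values assume the
named packages are installed):

    from altair.datasets import load

    cars = load("cars")                                 # inferred backend, e.g. polars.DataFrame
    cars_pd = load("cars", backend="pandas[pyarrow]")   # pandas.DataFrame with pyarrow dtypes
    cars_pa = load("cars", backend="pyarrow")           # pyarrow.Table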
https://github.com/vega/altair/pull/3631#discussion_r1847111064 https://github.com/vega/altair/pull/3631#discussion_r1847176465 --- altair/datasets/__init__.py | 94 +++++++++++++++++++++++++++++++------ tests/test_datasets.py | 81 ++++++++++++++++++++------------ 2 files changed, 132 insertions(+), 43 deletions(-) diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index d01ef6f60..26fd39b20 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, overload +from typing import TYPE_CHECKING, Generic, final, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -320,28 +320,94 @@ def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" -load: Loader[Any, Any] +@final +class _Load(Loader[IntoDataFrameT, IntoFrameT]): + @overload + def __call__( # pyright: ignore[reportOverlappingOverload] + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: None = ..., + **kwds: Any, + ) -> IntoDataFrameT: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["polars", "polars[pyarrow]"] = ..., + **kwds: Any, + ) -> pl.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pandas", "pandas[pyarrow]"] = ..., + **kwds: Any, + ) -> pd.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pyarrow"] = ..., + **kwds: Any, + ) -> pa.Table: ... 
+ def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + backend: _Backend | None = None, + **kwds: Any, + ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: + if backend is None: + return super().__call__(name, suffix, tag, **kwds) + else: + return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + + +load: _Load[Any, Any] +""" +For full IDE completions, instead use: + + from altair.datasets import Loader + load = Loader.from_backend("polars") + cars = load("cars") + movies = load("movies") + +Alternatively, specify ``backend`` during a call: + + from altair.datasets import load + cars = load("cars", backend="polars") + movies = load("movies", backend="polars") + +Related +------- +- https://github.com/vega/altair/pull/3631#issuecomment-2480832609 +- https://github.com/vega/altair/pull/3631#discussion_r1847111064 +- https://github.com/vega/altair/pull/3631#discussion_r1847176465 +""" def __getattr__(name): if name == "load": - import warnings - from altair.datasets._readers import infer_backend reader = infer_backend() global load - load = Loader.__new__(Loader) + load = _Load.__new__(_Load) load._reader = reader - - warnings.warn( - "For full IDE completions, instead use:\n\n" - " from altair.datasets import Loader\n" - " load = Loader.from_backend(...)\n\n" - "Related: https://github.com/vega/altair/pull/3631#issuecomment-2480832609", - UserWarning, - stacklevel=3, - ) return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 0d2deae7f..3d986ec75 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -3,7 +3,6 @@ import datetime as dt import re import sys -import warnings from functools import partial from importlib import import_module from importlib.util import find_spec @@ -11,7 +10,12 @@ from urllib.error import URLError import pytest -from narwhals.dependencies import is_into_dataframe, is_polars_dataframe +from narwhals.dependencies import ( + is_into_dataframe, + is_pandas_dataframe, + is_polars_dataframe, + is_pyarrow_table, +) from narwhals.stable import v1 as nw from altair.datasets import Loader @@ -138,47 +142,66 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" """ import altair.datasets + from altair.datasets import load - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning) - from altair.datasets import load + assert load._reader._name == "polars" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "polars", None) - assert load._reader._name == "polars" + from altair.datasets import load + + if find_spec("pyarrow") is None: + # NOTE: We can end the test early for the CI job that removes `pyarrow` + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pandas", None) + with pytest.raises(NotImplementedError, match="no.+backend"): + from altair.datasets import load + else: + assert load._reader._name == "pandas[pyarrow]" monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "polars", None) + monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load - if find_spec("pyarrow") is None: - # NOTE: We can end the test early for the CI job that removes `pyarrow` - assert load._reader._name == "pandas" - 
monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pandas", None) - with pytest.raises(NotImplementedError, match="no.+backend"): - from altair.datasets import load - else: - assert load._reader._name == "pandas[pyarrow]" - monkeypatch.delattr(altair.datasets, "load") + assert load._reader._name == "pandas" + monkeypatch.delattr(altair.datasets, "load") + + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.delitem(sys.modules, "pyarrow") + monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) + from altair.datasets import load - monkeypatch.setitem(sys.modules, "pyarrow", None) + assert load._reader._name == "pyarrow" + monkeypatch.delattr(altair.datasets, "load") + monkeypatch.setitem(sys.modules, "pyarrow", None) + with pytest.raises(NotImplementedError, match="no.+backend"): from altair.datasets import load - assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pandas", None) - monkeypatch.delitem(sys.modules, "pyarrow") - monkeypatch.setitem(sys.modules, "pyarrow", import_module("pyarrow")) - from altair.datasets import load +@requires_pyarrow +def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: + import altair.datasets + + monkeypatch.delattr(altair.datasets, "load", raising=False) + + load = altair.datasets.load + assert load._reader._name == "polars" - assert load._reader._name == "pyarrow" - monkeypatch.delattr(altair.datasets, "load") - monkeypatch.setitem(sys.modules, "pyarrow", None) + default = load("cars") + df_pyarrow = load("cars", backend="pyarrow") + df_pandas = load("cars", backend="pandas[pyarrow]") + default_2 = load("cars") + df_polars = load("cars", backend="polars") - with pytest.raises(NotImplementedError, match="no.+backend"): - from altair.datasets import load + assert is_polars_dataframe(default) + assert is_pyarrow_table(df_pyarrow) + assert is_pandas_dataframe(df_pandas) + assert is_polars_dataframe(default_2) + assert is_polars_dataframe(df_polars) @backends From c835c131282cc189b9bc4cc91bef2492c0b2dd36 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 13:25:27 +0000 Subject: [PATCH 112/137] feat(DRAFT): Adds `altair.datasets.url` A dataframe package is still required currently,. Can later be adapted to fit the requirements of (https://github.com/vega/altair/pull/3631#discussion_r1846662053). 
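A minimal usage sketch (illustrative; mirrors the ``url(...)`` helper and the
docstring example added below):

    import altair as alt
    from altair.datasets import url

    # Resolve the remote address only; no dataframe is materialized here
    source = url("cars")
    alt.Chart(source).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q")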
Related: - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 @mattijn, @joelostblom --- altair/datasets/__init__.py | 415 ++++-------------------------------- altair/datasets/_loader.py | 394 ++++++++++++++++++++++++++++++++++ tests/test_datasets.py | 59 ++++- 3 files changed, 491 insertions(+), 377 deletions(-) create mode 100644 altair/datasets/_loader.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index 26fd39b20..ac7ac9f06 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -1,380 +1,23 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, final, overload +from typing import TYPE_CHECKING -from narwhals.typing import IntoDataFrameT, IntoFrameT - -from altair.datasets._readers import _Reader, backend +from altair.datasets._loader import Loader if TYPE_CHECKING: import sys - from pathlib import Path - from typing import Any, Literal - - import pandas as pd - import polars as pl - import pyarrow as pa - from _typeshed import StrPath + from typing import Any if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - from altair.datasets._readers import _Backend - from altair.datasets._typing import Dataset, Extension, Version - -__all__ = ["Loader", "load"] - - -class Loader(Generic[IntoDataFrameT, IntoFrameT]): - """ - Load examples **remotely** from `vega-datasets`_, with *optional* caching. - - A new ``Loader`` must be initialized by specifying a backend: - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - >>> data # doctest: +SKIP - Loader[polars] - - .. _vega-datasets: - https://github.com/vega/vega-datasets - """ - - _reader: _Reader[IntoDataFrameT, IntoFrameT] - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["polars", "polars[pyarrow]"], / - ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / - ) -> Loader[pd.DataFrame, pd.DataFrame]: ... - - @overload - @classmethod - def from_backend( - cls, backend_name: Literal["pyarrow"], / - ) -> Loader[pa.Table, pa.Table]: ... - - @classmethod - def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: - """ - Initialize a new loader, with the specified backend. - - Parameters - ---------- - backend_name - DataFrame package/config used to return data. - - * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` - * *pandas*: Using `pandas defaults`_. - * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` - * *pyarrow*: (*Experimental*) - - .. warning:: - Most datasets use a `JSON format not supported`_ by ``pyarrow`` - - .. _polars defaults: - https://docs.pola.rs/api/python/stable/reference/io.html - .. _pandas defaults: - https://pandas.pydata.org/docs/reference/io.html - .. 
_JSON format not supported: - https://arrow.apache.org/docs/python/json.html#reading-json-files - - Examples - -------- - Using ``polars``: - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - cars = data("cars") - - >>> type(cars) # doctest: +SKIP - polars.dataframe.frame.DataFrame - - Using ``pandas``: - - data = Loader.from_backend("pandas") - cars = data("cars") - - >>> type(cars) # doctest: +SKIP - pandas.core.frame.DataFrame - - Using ``pandas``, backed by ``pyarrow`` dtypes: - - data = Loader.from_backend("pandas[pyarrow]") - cars = data("cars", tag="v1.29.0") - - >>> type(cars) # doctest: +SKIP - pandas.core.frame.DataFrame - - >>> cars.dtypes # doctest: +SKIP - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year string[pyarrow] - Origin string[pyarrow] - dtype: object - """ - obj = Loader.__new__(Loader) - obj._reader = backend(backend_name) - return obj - - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - **kwds: Any, - ) -> IntoDataFrameT: - """ - Get a remote dataset and load as tabular data. - - Parameters - ---------- - name - Name of the dataset/`Path.stem`_. - suffix - File extension/`Path.suffix`_. - - .. note:: - Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. - **kwds - Arguments passed to the underlying read function. - - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases - - Examples - -------- - Using ``polars``: - - from altair.datasets import Loader - data = Loader.from_backend("polars") - source = data("stocks", tag="v2.10.0") - - >>> source.columns # doctest: +SKIP - ['symbol', 'date', 'price'] - - >>> source # doctest: +SKIP - shape: (560, 3) - ┌────────┬────────────┬────────┐ - │ symbol ┆ date ┆ price │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞════════╪════════════╪════════╡ - │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ - │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ - │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ - │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ - │ MSFT ┆ May 1 2000 ┆ 25.45 │ - │ … ┆ … ┆ … │ - │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ - │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ - │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ - │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ - │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ - └────────┴────────────┴────────┘ - - Using ``pandas``: - - data = Loader.from_backend("pandas") - source = data("stocks", tag="v2.10.0") - - >>> source.columns # doctest: +SKIP - Index(['symbol', 'date', 'price'], dtype='object') - - >>> source # doctest: +SKIP - symbol date price - 0 MSFT Jan 1 2000 39.81 - 1 MSFT Feb 1 2000 36.35 - 2 MSFT Mar 1 2000 43.22 - 3 MSFT Apr 1 2000 28.37 - 4 MSFT May 1 2000 25.45 - .. ... ... ... 
- 555 AAPL Nov 1 2009 199.91 - 556 AAPL Dec 1 2009 210.73 - 557 AAPL Jan 1 2010 192.06 - 558 AAPL Feb 1 2010 204.62 - 559 AAPL Mar 1 2010 223.02 - - [560 rows x 3 columns] - - Using ``pyarrow``: - - data = Loader.from_backend("pyarrow") - source = data("stocks", tag="v2.10.0") - - >>> source.column_names # doctest: +SKIP - ['symbol', 'date', 'price'] - - >>> source # doctest: +SKIP - pyarrow.Table - symbol: string - date: string - price: double - ---- - symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] - date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] - price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] - """ - return self._reader.dataset(name, suffix, tag=tag, **kwds) - - def url( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - ) -> str: - """ - Return the address of a remote dataset. - - Parameters - ---------- - name - Name of the dataset/`Path.stem`_. - suffix - File extension/`Path.suffix`_. - - .. note:: - Only needed if ``name`` is available in multiple formats. - tag - Version identifier for a `vega-datasets release`_. - - .. _Path.stem: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem - .. _Path.suffix: - https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix - .. _vega-datasets release: - https://github.com/vega/vega-datasets/releases - - Examples - -------- - The returned url will always point to an accessible dataset: - - import altair as alt - from altair.datasets import Loader - - data = Loader.from_backend("polars") - >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP - 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' - - We can pass the result directly to a chart: - - url = data.url("cars", tag="v2.9.0") - alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") - """ - return self._reader.url(name, suffix, tag=tag) - - @property - def cache_dir(self) -> Path | None: - """ - Returns path to datasets cache. - - By default, this can be configured using the environment variable: - - "ALTAIR_DATASETS_DIR" - - You *may* also set this directly, but the value will **not** persist between sessions: - - from pathlib import Path - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - data.cache_dir = Path.home() / ".altair_cache" - - >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP - '.altair_cache' - """ - return self._reader._cache - - @cache_dir.setter - def cache_dir(self, source: StrPath, /) -> None: - import os - - os.environ[self._reader._ENV_VAR] = str(source) - - def __repr__(self) -> str: - return f"{type(self).__name__}[{self._reader._name}]" + from altair.datasets._loader import _Load + from altair.datasets._typing import Dataset, Extension, Version -@final -class _Load(Loader[IntoDataFrameT, IntoFrameT]): - @overload - def __call__( # pyright: ignore[reportOverlappingOverload] - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: None = ..., - **kwds: Any, - ) -> IntoDataFrameT: ... - @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["polars", "polars[pyarrow]"] = ..., - **kwds: Any, - ) -> pl.DataFrame: ... 
- @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["pandas", "pandas[pyarrow]"] = ..., - **kwds: Any, - ) -> pd.DataFrame: ... - @overload - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = ..., - /, - tag: Version | None = ..., - backend: Literal["pyarrow"] = ..., - **kwds: Any, - ) -> pa.Table: ... - def __call__( - self, - name: Dataset | LiteralString, - suffix: Extension | None = None, - /, - tag: Version | None = None, - backend: _Backend | None = None, - **kwds: Any, - ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: - if backend is None: - return super().__call__(name, suffix, tag, **kwds) - else: - return self.from_backend(backend)(name, suffix, tag=tag, **kwds) +__all__ = ["Loader", "load", "url"] load: _Load[Any, Any] @@ -400,14 +43,50 @@ def __call__( """ +def url( + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, +) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Related + ------- + - https://github.com/vega/altair/pull/3631#issuecomment-2484826592 + - https://github.com/vega/altair/pull/3631#issuecomment-2480832711 + - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 + - https://github.com/vega/altair/pull/3631#discussion_r1846662053 + """ + from altair.datasets._loader import load + + return load.url(name, suffix, tag=tag) + + def __getattr__(name): if name == "load": - from altair.datasets._readers import infer_backend + from altair.datasets._loader import load - reader = infer_backend() - global load - load = _Load.__new__(_Load) - load._reader = reader return load else: msg = f"module {__name__!r} has no attribute {name!r}" diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py new file mode 100644 index 000000000..3c2a0ee21 --- /dev/null +++ b/altair/datasets/_loader.py @@ -0,0 +1,394 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Generic, final, overload + +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._readers import _Reader, backend + +if TYPE_CHECKING: + import sys + from pathlib import Path + from typing import Any, Literal + + import pandas as pd + import polars as pl + import pyarrow as pa + from _typeshed import StrPath + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Backend + from altair.datasets._typing import Dataset, Extension, Version + +__all__ = ["Loader", "load"] + + +class Loader(Generic[IntoDataFrameT, IntoFrameT]): + """ + Load examples **remotely** from `vega-datasets`_, with *optional* caching. + + A new ``Loader`` must be initialized by specifying a backend: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + >>> data # doctest: +SKIP + Loader[polars] + + .. 
_vega-datasets: + https://github.com/vega/vega-datasets + """ + + _reader: _Reader[IntoDataFrameT, IntoFrameT] + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["polars", "polars[pyarrow]"], / + ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pandas", "pandas[pyarrow]"], / + ) -> Loader[pd.DataFrame, pd.DataFrame]: ... + + @overload + @classmethod + def from_backend( + cls, backend_name: Literal["pyarrow"], / + ) -> Loader[pa.Table, pa.Table]: ... + + @classmethod + def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: + """ + Initialize a new loader, with the specified backend. + + Parameters + ---------- + backend_name + DataFrame package/config used to return data. + + * *polars*: Using `polars defaults`_ + * *polars[pyarrow]*: Using ``use_pyarrow=True`` + * *pandas*: Using `pandas defaults`_. + * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` + * *pyarrow*: (*Experimental*) + + .. warning:: + Most datasets use a `JSON format not supported`_ by ``pyarrow`` + + .. _polars defaults: + https://docs.pola.rs/api/python/stable/reference/io.html + .. _pandas defaults: + https://pandas.pydata.org/docs/reference/io.html + .. _JSON format not supported: + https://arrow.apache.org/docs/python/json.html#reading-json-files + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + cars = data("cars") + + >>> type(cars) # doctest: +SKIP + polars.dataframe.frame.DataFrame + + Using ``pandas``: + + data = Loader.from_backend("pandas") + cars = data("cars") + + >>> type(cars) # doctest: +SKIP + pandas.core.frame.DataFrame + + Using ``pandas``, backed by ``pyarrow`` dtypes: + + data = Loader.from_backend("pandas[pyarrow]") + cars = data("cars", tag="v1.29.0") + + >>> type(cars) # doctest: +SKIP + pandas.core.frame.DataFrame + + >>> cars.dtypes # doctest: +SKIP + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year string[pyarrow] + Origin string[pyarrow] + dtype: object + """ + obj = Loader.__new__(Loader) + obj._reader = backend(backend_name) + return obj + + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + **kwds: Any, + ) -> IntoDataFrameT: + """ + Get a remote dataset and load as tabular data. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + **kwds + Arguments passed to the underlying read function. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. 
_vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + Using ``polars``: + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + shape: (560, 3) + ┌────────┬────────────┬────────┐ + │ symbol ┆ date ┆ price │ + │ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ f64 │ + ╞════════╪════════════╪════════╡ + │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ + │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ + │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ + │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ + │ MSFT ┆ May 1 2000 ┆ 25.45 │ + │ … ┆ … ┆ … │ + │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ + │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ + │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ + │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ + │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ + └────────┴────────────┴────────┘ + + Using ``pandas``: + + data = Loader.from_backend("pandas") + source = data("stocks", tag="v2.10.0") + + >>> source.columns # doctest: +SKIP + Index(['symbol', 'date', 'price'], dtype='object') + + >>> source # doctest: +SKIP + symbol date price + 0 MSFT Jan 1 2000 39.81 + 1 MSFT Feb 1 2000 36.35 + 2 MSFT Mar 1 2000 43.22 + 3 MSFT Apr 1 2000 28.37 + 4 MSFT May 1 2000 25.45 + .. ... ... ... + 555 AAPL Nov 1 2009 199.91 + 556 AAPL Dec 1 2009 210.73 + 557 AAPL Jan 1 2010 192.06 + 558 AAPL Feb 1 2010 204.62 + 559 AAPL Mar 1 2010 223.02 + + [560 rows x 3 columns] + + Using ``pyarrow``: + + data = Loader.from_backend("pyarrow") + source = data("stocks", tag="v2.10.0") + + >>> source.column_names # doctest: +SKIP + ['symbol', 'date', 'price'] + + >>> source # doctest: +SKIP + pyarrow.Table + symbol: string + date: string + price: double + ---- + symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] + date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] + price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] + """ + return self._reader.dataset(name, suffix, tag=tag, **kwds) + + def url( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + ) -> str: + """ + Return the address of a remote dataset. + + Parameters + ---------- + name + Name of the dataset/`Path.stem`_. + suffix + File extension/`Path.suffix`_. + + .. note:: + Only needed if ``name`` is available in multiple formats. + tag + Version identifier for a `vega-datasets release`_. + + .. _Path.stem: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.stem + .. _Path.suffix: + https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.suffix + .. _vega-datasets release: + https://github.com/vega/vega-datasets/releases + + Examples + -------- + The returned url will always point to an accessible dataset: + + import altair as alt + from altair.datasets import Loader + + data = Loader.from_backend("polars") + >>> data.url("cars", tag="v2.9.0") # doctest: +SKIP + 'https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/data/cars.json' + + We can pass the result directly to a chart: + + url = data.url("cars", tag="v2.9.0") + alt.Chart(url).mark_point().encode(x="Horsepower:Q", y="Miles_per_Gallon:Q") + """ + return self._reader.url(name, suffix, tag=tag) + + @property + def cache_dir(self) -> Path | None: + """ + Returns path to datasets cache. 
+ + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You *may* also set this directly, but the value will **not** persist between sessions: + + from pathlib import Path + + from altair.datasets import Loader + + data = Loader.from_backend("polars") + data.cache_dir = Path.home() / ".altair_cache" + + >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP + '.altair_cache' + """ + return self._reader._cache + + @cache_dir.setter + def cache_dir(self, source: StrPath, /) -> None: + import os + + os.environ[self._reader._ENV_VAR] = str(source) + + def __repr__(self) -> str: + return f"{type(self).__name__}[{self._reader._name}]" + + +@final +class _Load(Loader[IntoDataFrameT, IntoFrameT]): + @overload + def __call__( # pyright: ignore[reportOverlappingOverload] + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: None = ..., + **kwds: Any, + ) -> IntoDataFrameT: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["polars", "polars[pyarrow]"] = ..., + **kwds: Any, + ) -> pl.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pandas", "pandas[pyarrow]"] = ..., + **kwds: Any, + ) -> pd.DataFrame: ... + @overload + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = ..., + /, + tag: Version | None = ..., + backend: Literal["pyarrow"] = ..., + **kwds: Any, + ) -> pa.Table: ... + def __call__( + self, + name: Dataset | LiteralString, + suffix: Extension | None = None, + /, + tag: Version | None = None, + backend: _Backend | None = None, + **kwds: Any, + ) -> IntoDataFrameT | pl.DataFrame | pd.DataFrame | pa.Table: + if backend is None: + return super().__call__(name, suffix, tag, **kwds) + else: + return self.from_backend(backend)(name, suffix, tag=tag, **kwds) + + +load: _Load[Any, Any] + + +def __getattr__(name): + if name == "load": + from altair.datasets._readers import infer_backend + + reader = infer_backend() + global load + load = _Load.__new__(_Load) + load._reader = reader + return load + else: + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3d986ec75..6de691ff2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -141,11 +141,11 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: priority: Sequence[_Backend] = "polars", "pandas[pyarrow]", "pandas", "pyarrow" """ - import altair.datasets + import altair.datasets._loader from altair.datasets import load assert load._reader._name == "polars" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) monkeypatch.setitem(sys.modules, "polars", None) @@ -154,20 +154,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: if find_spec("pyarrow") is None: # NOTE: We can end the test early for the CI job that removes `pyarrow` assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) with pytest.raises(NotImplementedError, match="no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" - 
monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) from altair.datasets import load assert load._reader._name == "pandas" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) monkeypatch.delitem(sys.modules, "pyarrow") @@ -175,7 +175,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load assert load._reader._name == "pyarrow" - monkeypatch.delattr(altair.datasets, "load") + monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) with pytest.raises(NotImplementedError, match="no.+backend"): @@ -184,11 +184,11 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: @requires_pyarrow def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: - import altair.datasets + import altair.datasets._loader - monkeypatch.delattr(altair.datasets, "load", raising=False) + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + from altair.datasets import load - load = altair.datasets.load assert load._reader._name == "polars" default = load("cars") @@ -204,6 +204,47 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: assert is_polars_dataframe(df_polars) +@pytest.mark.parametrize( + "name", + [ + "jobs", + "la-riots", + "londonBoroughs", + "londonCentroids", + "londonTubeLines", + "lookup_groups", + "lookup_people", + "miserables", + "monarchs", + "movies", + "normal-2d", + "obesity", + "ohlc", + "penguins", + "platformer-terrain", + "points", + "political-contributions", + "population", + "population_engineers_hurricanes", + "seattle-temps", + "seattle-weather", + "seattle-weather-hourly-normals", + "sf-temps", + "sp500", + "sp500-2000", + "stocks", + "udistrict", + ], +) +def test_url(name: Dataset) -> None: + from altair.datasets import url + + pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+") + result = url(name) + assert isinstance(result, str) + assert pattern.match(result) is not None + + @backends def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv(CACHE_ENV_VAR, raising=False) From 0817ff8503f728a4bc0c8d160abaab311f829fd7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:46:22 +0000 Subject: [PATCH 113/137] feat: Support `url(...)` without dependencies https://github.com/vega/altair/pull/3631#discussion_r1846662053, https://github.com/vega/altair/pull/3631#issuecomment-2488621316, https://github.com/vega/altair/pull/3631#issuecomment-2481977891 --- altair/datasets/__init__.py | 13 ++++- altair/datasets/_loader.py | 77 +++++++++++++++++++++++++-- altair/datasets/_metadata/url.csv.gz | Bin 0 -> 855 bytes altair/datasets/_readers.py | 5 +- tests/test_datasets.py | 70 +++++++++++++++++++----- tools/datasets/__init__.py | 23 ++++++++ 6 files changed, 168 insertions(+), 20 deletions(-) create mode 100644 altair/datasets/_metadata/url.csv.gz diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index ac7ac9f06..e426ca467 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -78,9 +78,18 @@ def url( - https://github.com/vega/altair/discussions/3150#discussioncomment-11280516 - https://github.com/vega/altair/pull/3631#discussion_r1846662053 """ - from altair.datasets._loader import load + from 
altair.datasets._readers import AltairDatasetsError - return load.url(name, suffix, tag=tag) + try: + from altair.datasets._loader import load + + url = load.url(name, suffix, tag=tag) + except AltairDatasetsError: + from altair.datasets._loader import url_cache + + url = url_cache[name] + + return url def __getattr__(name): diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 3c2a0ee21..5d8c1ec8b 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Generic, final, overload +from pathlib import Path +from typing import TYPE_CHECKING, Generic, TypeVar, final, get_args, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -8,8 +9,8 @@ if TYPE_CHECKING: import sys - from pathlib import Path - from typing import Any, Literal + from collections.abc import MutableMapping + from typing import Any, Final, Literal import pandas as pd import polars as pl @@ -23,8 +24,15 @@ from altair.datasets._readers import _Backend from altair.datasets._typing import Dataset, Extension, Version + __all__ = ["Loader", "load"] +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") +_T = TypeVar("_T") + +_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" + class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ @@ -377,6 +385,69 @@ def __call__( return self.from_backend(backend)(name, suffix, tag=tag, **kwds) +class UrlCache(Generic[_KT, _VT]): + """ + `csv`_, `gzip`_ -based, lazy url lookup. + + Operates on a subset of available datasets: + - Only the latest version + - Excludes `.parquet`, which `cannot be read via url`_ + - Name collisions are pre-resolved + - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) + + .. _csv: + https://docs.python.org/3/library/csv.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. _cannot be read via url: + https://github.com/vega/vega/issues/3961 + """ + + def __init__( + self, + fp: Path, + /, + *, + columns: tuple[str, str] = ("dataset_name", "url_npm"), + tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], + ) -> None: + self.fp: Path = fp + self.columns: tuple[str, str] = columns + self._mapping: MutableMapping[_KT, _VT] = tp() + + def read(self) -> Any: + import csv + import gzip + + with gzip.open(self.fp, mode="rb") as f: + b_lines = f.readlines() + reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) + header = tuple(next(reader)) + if header != self.columns: + msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" + raise ValueError(msg) + return dict(reader) + + def __getitem__(self, key: _KT, /) -> _VT: + if url := self.get(key, None): + return url + + from altair.datasets._typing import Dataset + + if key in get_args(Dataset): + msg = f"{key!r} cannot be loaded via url." + raise TypeError(msg) + else: + msg = f"{key!r} does not refer to a known dataset." 
+ raise TypeError(msg) + + def get(self, key: _KT, default: _T) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) load: _Load[Any, Any] diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..3580606d7cca77cefee4c5bd2b48134f9fac22d9 GIT binary patch literal 855 zcmV-d1E~BTiwFn-B0gsV|8;U~E@N|c0Ik?dQ`;~Q0N^{n1vw!zOlNxPwAUVc&&ZNu zi^y7452x_!tCyXSP&%W{2?*?`_3GgjoO3H3_r`tQY#7(w zi{nDc*>+m^P5g_^ECxz=iFM!RUHA0VZ8zzIO$zRe9v-N)2CR3@(gJkM%@0)TKov1o zFhp|ilo$y*!j8ez3xrvK!u8ZD@!E`)@JdO`owxER+G}`W zEtvIEBdio&C`N62QYpAHupLh2qjt z=LMpUtB{|STRBTTv}+~4Bqyl#OjpT<8rlR6jb?__SlKys|TI+ysb(2Ji^3oO1m3l7I%_CtIcgP|{!TI~FZ z5nzH6z(o`Ca%eSoP~I-#4E#o3^u+CDCVsCkDHGIC#d&IkW>6RrX~Y| zRj;Hh`SzhdXFnSGUPBezJa4zDciy(MD{&TaSaCeCBciT3JWC;7FH^hF-VLupS%yK! z7D>VD6yKbLG7HYdW|IepyP<#1-VS}2fjXZmq-8pJFh~EHsEMX~-qgJq{nRH8>u8dgNo+|G_y6nVA$Qi?D~SmD^hzYb99B`0-a+w4v7E$To`#? hPGA)$PnhL%CQ6!b(lFqNy}1B6!M}FHK(GfF008u|rd9v| literal 0 HcmV?d00001 diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 953401bae..e93fb55e1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -83,6 +83,9 @@ _METADATA: Final[Path] = Path(__file__).parent / "_metadata" / "metadata.parquet" +class AltairDatasetsError(Exception): ... + + class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): """ Describes basic IO for remote & local tabular resources. @@ -502,7 +505,7 @@ def infer_backend( if reader := next(it, None): return reader msg = f"Found no supported backend, searched:\n" f"{priority!r}" - raise NotImplementedError(msg) + raise AltairDatasetsError(msg) @overload diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 6de691ff2..e5d1f1d3f 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -1,5 +1,6 @@ from __future__ import annotations +import contextlib import datetime as dt import re import sys @@ -18,8 +19,8 @@ ) from narwhals.stable import v1 as nw -from altair.datasets import Loader -from altair.datasets._readers import _METADATA +from altair.datasets import Loader, url +from altair.datasets._readers import _METADATA, AltairDatasetsError from altair.datasets._typing import Dataset, Extension, Metadata, Version from tests import skip_requires_pyarrow, slow @@ -115,6 +116,13 @@ def metadata_columns() -> frozenset[str]: ) +def match_url(name: Dataset, url: str) -> bool: + return ( + re.match(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+", url) + is not None + ) + + @backends def test_loader_from_backend(backend: _Backend) -> None: data = Loader.from_backend(backend) @@ -124,13 +132,8 @@ def test_loader_from_backend(backend: _Backend) -> None: @backends def test_loader_url(backend: _Backend) -> None: data = Loader.from_backend(backend) - dataset_name = "volcano" - pattern = re.compile( - rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{dataset_name}\..+" - ) - url = data.url(dataset_name) - assert isinstance(url, str) - assert pattern.match(url) is not None + dataset_name: Dataset = "volcano" + assert match_url(dataset_name, data.url(dataset_name)) def test_load(monkeypatch: pytest.MonkeyPatch) -> None: @@ -178,7 +181,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pyarrow", None) - with pytest.raises(NotImplementedError, match="no.+backend"): + 
with pytest.raises(AltairDatasetsError, match="no.+backend"): from altair.datasets import load @@ -239,10 +242,49 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: def test_url(name: Dataset) -> None: from altair.datasets import url - pattern = re.compile(rf".+jsdelivr\.net/npm/vega-datasets@.+/data/{name}\..+") - result = url(name) - assert isinstance(result, str) - assert pattern.match(result) is not None + assert match_url(name, url(name)) + + +def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: + import altair.datasets + from altair.datasets._loader import url_cache + + monkeypatch.setitem(sys.modules, "polars", None) + monkeypatch.setitem(sys.modules, "pandas", None) + monkeypatch.setitem(sys.modules, "pyarrow", None) + + assert url_cache._mapping == {} + + with contextlib.suppress(AltairDatasetsError): + monkeypatch.delattr(altair.datasets._loader, "load", raising=False) + with pytest.raises(AltairDatasetsError): + from altair.datasets import load as load + + assert match_url("jobs", url("jobs")) + + assert url_cache._mapping != {} + + assert match_url("cars", url("cars")) + assert match_url("stocks", url("stocks")) + assert match_url("countries", url("countries")) + assert match_url("crimea", url("crimea")) + assert match_url("disasters", url("disasters")) + assert match_url("driving", url("driving")) + assert match_url("earthquakes", url("earthquakes")) + assert match_url("flare", url("flare")) + assert match_url("flights-10k", url("flights-10k")) + assert match_url("flights-200k", url("flights-200k")) + + with pytest.raises(TypeError, match="cannot be loaded via url"): + url("climate") + + with pytest.raises(TypeError, match="cannot be loaded via url"): + url("flights-3m") + + with pytest.raises( + TypeError, match="'fake data' does not refer to a known dataset" + ): + url("fake data") @backends diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 3702028ac..ae4d0b583 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -15,12 +15,15 @@ from __future__ import annotations +import gzip import json import types +from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, Literal import polars as pl +from polars import col from tools.codemod import ruff from tools.datasets.github import GitHub @@ -107,6 +110,7 @@ def __init__( } ) self._fp_typing: Path = out_fp_typing + self._fp_url: Path = out_dir_altair / "url.csv.gz" @property def github(self) -> GitHub: @@ -135,6 +139,14 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) + if include_typing: self.generate_typing(self._fp_typing) return gh_trees @@ -159,6 +171,17 @@ def _from_alias(self, name: _PathAlias, /) -> Path: else: return self._paths[name] + def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + if fp.suffix != ".gz": + fp = fp.with_suffix(".csv.gz") + if not fp.exists(): + fp.touch() + df = frame.lazy().collect() + buf = BytesIO() + with gzip.open(fp, mode="wb") as f: + df.write_csv(buf) + f.write(buf.getbuffer()) + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` 
to ``fp``, with some extra safety.""" if not fp.exists(): From e01fdd727b2bbfa389e995d126506d647d60ea9f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:52:32 +0000 Subject: [PATCH 114/137] fix(DRAFT): Don't generate csv on refresh https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 --- altair/datasets/_metadata/url.csv.gz | Bin 855 -> 855 bytes tools/datasets/__init__.py | 21 +++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz index 3580606d7cca77cefee4c5bd2b48134f9fac22d9..49a227404cc162e9177aee307c297cfffd4869a1 100644 GIT binary patch delta 15 Wcmcc4cAbq)zMF$1Dsm%RATt0aRs=Z! delta 15 Wcmcc4cAbq)zMF%CQE4MvATt0XEd!|l diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index ae4d0b583..398c06f84 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -120,7 +120,9 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: + def refresh( + self, *, include_typing: bool = False, include_csv: bool = False + ) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -139,13 +141,16 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + if include_csv: + # BUG: Non-deterministic + # https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) if include_typing: self.generate_typing(self._fp_typing) From 0c5195e92d428033b311b784b30c69f5ebeac6ee Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 20 Nov 2024 21:57:19 +0000 Subject: [PATCH 115/137] test: Replace rogue `NotImplementedError` https://github.com/vega/altair/actions/runs/11942364658/job/33289235198?pr=3631 --- tests/test_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e5d1f1d3f..a4bbe40c4 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -159,7 +159,7 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: assert load._reader._name == "pandas" monkeypatch.delattr(altair.datasets._loader, "load") monkeypatch.setitem(sys.modules, "pandas", None) - with pytest.raises(NotImplementedError, match="no.+backend"): + with pytest.raises(AltairDatasetsError, match="no.+backend"): from altair.datasets import load else: assert load._reader._name == "pandas[pyarrow]" From 5595d905c29a89d6388b12b46caa016e9cd91d27 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 10:50:12 +0000 Subject: [PATCH 116/137] fix: Omit `.gz` last modification time header Previously was creating a diff on every refresh, since the current time updated. 
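A small self-contained sketch of why pinning ``mtime`` gives reproducible output
(toy payload, not the real metadata):

    import gzip
    import io

    def compress(payload: bytes, mtime: int) -> bytes:
        buf = io.BytesIO()
        # A fixed mtime keeps the gzip member header byte-identical between runs
        with gzip.GzipFile(fileobj=buf, mode="wb", mtime=mtime) as f:
            f.write(payload)
        return buf.getvalue()

    payload = b"dataset_name,url_npm\n"
    assert compress(payload, 0) == compress(payload, 0)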
https://docs.python.org/3/library/gzip.html#gzip.GzipFile.mtime https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 --- altair/datasets/_metadata/url.csv.gz | Bin 855 -> 855 bytes tools/datasets/__init__.py | 23 +++++++++-------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/altair/datasets/_metadata/url.csv.gz b/altair/datasets/_metadata/url.csv.gz index 49a227404cc162e9177aee307c297cfffd4869a1..07cb52ec1c834808609b204ed2ffe0b4cd83f62e 100644 GIT binary patch delta 17 Xcmcc4cAbqwzMF%C0SGp7_%j0lCyxV& delta 17 Ycmcc4cAbqwzMF$1D$*`}BZogT05C-a-~a#s diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 398c06f84..a3690f65f 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -120,9 +120,7 @@ def github(self) -> GitHub: def npm(self) -> Npm: return self._npm - def refresh( - self, *, include_typing: bool = False, include_csv: bool = False - ) -> pl.DataFrame: + def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: """ Update and sync all dataset metadata files. @@ -141,16 +139,13 @@ def refresh( gh_trees = self.github.refresh_trees(gh_tags) self.write_parquet(gh_trees, self._paths["gh_trees"]) - if include_csv: - # BUG: Non-deterministic - # https://github.com/vega/altair/actions/runs/11942284568/job/33288974210?pr=3631 - npm_urls_min = ( - gh_trees.lazy() - .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") - .filter(col("size") == col("size").min().over("dataset_name")) - .select("dataset_name", "url_npm") - ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + npm_urls_min = ( + gh_trees.lazy() + .filter(col("tag") == col("tag").first(), col("suffix") != ".parquet") + .filter(col("size") == col("size").min().over("dataset_name")) + .select("dataset_name", "url_npm") + ) + self.write_csv_gzip(npm_urls_min, self._fp_url) if include_typing: self.generate_typing(self._fp_typing) @@ -183,7 +178,7 @@ def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> Non fp.touch() df = frame.lazy().collect() buf = BytesIO() - with gzip.open(fp, mode="wb") as f: + with gzip.GzipFile(fp, mode="wb", mtime=0) as f: df.write_csv(buf) f.write(buf.getbuffer()) From 9f621519ac4eb84e506632d81e6b794e55eee00c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 11:01:02 +0000 Subject: [PATCH 117/137] docs: Add doc for `Application.write_csv_gzip` --- tools/datasets/__init__.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index a3690f65f..26955e9c0 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -172,6 +172,17 @@ def _from_alias(self, name: _PathAlias, /) -> Path: return self._paths[name] def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: + """ + Write ``frame`` as a `gzip`_ compressed `csv`_ file. + + - *Much smaller* than a regular ``.csv``. + - Still readable using ``stdlib`` modules. + + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. 
_csv: + https://docs.python.org/3/library/csv.html + """ if fp.suffix != ".gz": fp = fp.with_suffix(".csv.gz") if not fp.exists(): From 1bd455206d5898800ae87d7c22cafba05c9c012e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:34:02 +0000 Subject: [PATCH 118/137] revert: Remove `"polars[pyarrow]" backend Partially related to https://github.com/vega/altair/pull/3631#issuecomment-2484826592 After some thought, this backend didn't add support for any unique dependency configs. I've only ever used `use_pyarrow=True` for `pl.DataFrame.write_parquet` to resolve an issue with invalid headers in `"polars<1.0.0;>=0.19.0"` --- altair/datasets/_loader.py | 5 ++--- altair/datasets/_readers.py | 32 +++----------------------------- tests/test_datasets.py | 5 ++--- 3 files changed, 7 insertions(+), 35 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 5d8c1ec8b..3e31aea2e 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -55,7 +55,7 @@ class Loader(Generic[IntoDataFrameT, IntoFrameT]): @overload @classmethod def from_backend( - cls, backend_name: Literal["polars", "polars[pyarrow]"], / + cls, backend_name: Literal["polars"], / ) -> Loader[pl.DataFrame, pl.LazyFrame]: ... @overload @@ -81,7 +81,6 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: DataFrame package/config used to return data. * *polars*: Using `polars defaults`_ - * *polars[pyarrow]*: Using ``use_pyarrow=True`` * *pandas*: Using `pandas defaults`_. * *pandas[pyarrow]*: Using ``dtype_backend="pyarrow"`` * *pyarrow*: (*Experimental*) @@ -347,7 +346,7 @@ def __call__( suffix: Extension | None = ..., /, tag: Version | None = ..., - backend: Literal["polars", "polars[pyarrow]"] = ..., + backend: Literal["polars"] = ..., **kwds: Any, ) -> pl.DataFrame: ... @overload diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e93fb55e1..f7b8aecf5 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -73,9 +73,8 @@ _Pandas: TypeAlias = Literal["pandas"] _PyArrow: TypeAlias = Literal["pyarrow"] _ConcreteT = TypeVar("_ConcreteT", _Polars, _Pandas, _PyArrow) - _PolarsAny: TypeAlias = Literal[_Polars, "polars[pyarrow]"] _PandasAny: TypeAlias = Literal[_Pandas, "pandas[pyarrow]"] - _Backend: TypeAlias = Literal[_PolarsAny, _PandasAny, _PyArrow] + _Backend: TypeAlias = Literal[_Polars, _PandasAny, _PyArrow] __all__ = ["backend"] @@ -332,25 +331,6 @@ def __init__(self, name: _Polars, /) -> None: self._scan_fn = {".parquet": pl.scan_parquet} -class _PolarsPyArrowReader(_Reader["pl.DataFrame", "pl.LazyFrame"]): - def __init__(self, name: Literal["polars[pyarrow]"], /) -> None: - _pl, _pa = _requirements(name) - self._name = name - if not TYPE_CHECKING: - pl = self._import(_pl) - pa = self._import(_pa) # noqa: F841 - self._read_fn = { - ".csv": partial(pl.read_csv, use_pyarrow=True, try_parse_dates=True), - ".json": _pl_read_json_roundtrip, - ".tsv": partial( - pl.read_csv, separator="\t", use_pyarrow=True, try_parse_dates=True - ), - ".arrow": partial(pl.read_ipc, use_pyarrow=True), - ".parquet": partial(pl.read_parquet, use_pyarrow=True), - } - self._scan_fn = {".parquet": pl.scan_parquet} - - class _PyArrowReader(_Reader["pa.Table", "pa.Table"]): """ Reader backed by `pyarrow.Table`_. @@ -509,7 +489,7 @@ def infer_backend( @overload -def backend(name: _PolarsAny, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... 
+def backend(name: _Polars, /) -> _Reader[pl.DataFrame, pl.LazyFrame]: ... @overload @@ -524,8 +504,6 @@ def backend(name: _Backend, /) -> _Reader[Any, Any]: """Reader initialization dispatcher.""" if name == "polars": return _PolarsReader(name) - elif name == "polars[pyarrow]": - return _PolarsPyArrowReader(name) elif name == "pandas[pyarrow]": return _PandasPyArrowReader(name) elif name == "pandas": @@ -548,10 +526,6 @@ def _requirements(s: _ConcreteT, /) -> _ConcreteT: ... def _requirements(s: Literal["pandas[pyarrow]"], /) -> tuple[_Pandas, _PyArrow]: ... -@overload -def _requirements(s: Literal["polars[pyarrow]"], /) -> tuple[_Polars, _PyArrow]: ... - - def _requirements(s: _Backend, /): concrete: set[Literal[_Polars, _Pandas, _PyArrow]] = {"polars", "pandas", "pyarrow"} if s in concrete: @@ -560,7 +534,7 @@ def _requirements(s: _Backend, /): from packaging.requirements import Requirement req = Requirement(s) - supports_extras: set[Literal[_Polars, _Pandas]] = {"polars", "pandas"} + supports_extras: set[Literal[_Pandas]] = {"pandas"} if req.name in supports_extras: name = req.name if (extras := req.extras) and extras == {"pyarrow"}: diff --git a/tests/test_datasets.py b/tests/test_datasets.py index a4bbe40c4..e31f7990e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -68,7 +68,6 @@ class DatasetSpec(TypedDict, total=False): ), ), ), - pytest.param("polars[pyarrow]", marks=requires_pyarrow), pytest.param("pandas[pyarrow]", marks=requires_pyarrow), pytest.param("pyarrow", marks=requires_pyarrow), ], @@ -302,7 +301,7 @@ def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None def test_missing_dependency_single( backend: _Backend, monkeypatch: pytest.MonkeyPatch ) -> None: - if backend in {"polars[pyarrow]", "pandas[pyarrow]"}: + if backend == "pandas[pyarrow]": pytest.skip("Testing single dependency backends only") monkeypatch.setitem(sys.modules, backend, None) @@ -317,7 +316,7 @@ def test_missing_dependency_single( Loader.from_backend(backend) -@pytest.mark.parametrize("backend", ["polars[pyarrow]", "pandas[pyarrow]"]) +@pytest.mark.parametrize("backend", ["pandas[pyarrow]"]) @skip_requires_pyarrow def test_missing_dependency_multi( backend: _Backend, monkeypatch: pytest.MonkeyPatch From 11da9c8f584e466a02a021ef8e93b895145fb333 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 21 Nov 2024 12:41:10 +0000 Subject: [PATCH 119/137] test: Add a complex `xfail` for `test_load_call` Doesn't happen in CI, still unclear why the import within `pandas` breaks under these conditions. 
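The `xfail` below is made conditional on how the suite is invoked, which leans on a pytest pattern worth spelling out: marks cannot request fixtures, but a fixture can read command line options through `request.config`, and the test can then decide at run time. A minimal, illustrative sketch of that pattern (the names here are hypothetical; the real fixture is `is_flaky_datasets` in the diff):

    from __future__ import annotations

    import pytest

    @pytest.fixture
    def mark_filter(request: pytest.FixtureRequest) -> str | None:
        # value passed to `-m` on the command line, or None when unset
        return request.config.getoption("-m", None)

    def test_example(mark_filter: str | None) -> None:
        if mark_filter == "":
            pytest.xfail("known to be flaky when run with an empty marker expression")
        assert True
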
Have tried multiple combinations of `pytest.MonkeyPatch`, hard imports, but had no luck in fixing the bug --- tests/test_datasets.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index e31f7990e..50ece0a26 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -86,6 +86,19 @@ class DatasetSpec(TypedDict, total=False): """ +@pytest.fixture +def is_flaky_datasets(request: pytest.FixtureRequest) -> bool: + mark_filter = request.config.getoption("-m", None) # pyright: ignore[reportArgumentType] + if mark_filter is None: + return False + elif mark_filter == "": + return True + elif isinstance(mark_filter, str): + return False + else: + raise TypeError(mark_filter) + + @pytest.fixture(scope="session") def polars_loader( tmp_path_factory: pytest.TempPathFactory, @@ -184,6 +197,20 @@ def test_load(monkeypatch: pytest.MonkeyPatch) -> None: from altair.datasets import load +# HACK: Using a fixture to get a command line option +# https://docs.pytest.org/en/stable/example/simple.html#pass-different-values-to-a-test-function-depending-on-command-line-options +@pytest.mark.xfail( + is_flaky_datasets, # type: ignore + reason=( + "'pandas[pyarrow]' seems to break locally when running:\n" + ">>> pytest -p no:randomly -n logical tests -k test_datasets -m ''\n\n" + "Possibly related:\n" + " https://github.com/modin-project/modin/issues/951\n" + " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L164\n" + " https://github.com/pandas-dev/pandas/blob/1c986d6213904fd7d9acc5622dc91d029d3f1218/pandas/io/parquet.py#L257\n" + ), + raises=AttributeError, +) @requires_pyarrow def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets._loader From 694ada0ad496ecd0e07f49ff97e0c5c0753a6085 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:46:48 +0000 Subject: [PATCH 120/137] refactor: Renaming/recomposing `_readers.py` The next commits benefit from having functionality decoupled from `_Reader.query`. Mainly, keeping things lazy and not raising a user-facing error --- altair/datasets/_readers.py | 68 +++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index f7b8aecf5..2c8d53820 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -69,6 +69,13 @@ _ExtensionScan: TypeAlias = Literal[".parquet"] _T = TypeVar("_T") + # NOTE: Using a constrained instead of bound `TypeVar` + # error: Incompatible return value type (got "DataFrame[Any] | LazyFrame[Any]", expected "FrameT") [return-value] + # - https://typing.readthedocs.io/en/latest/spec/generics.html#introduction + # - https://typing.readthedocs.io/en/latest/spec/generics.html#type-variables-with-an-upper-bound + # https://github.com/narwhals-dev/narwhals/blob/21b8436567de3631c584ef67632317ad70ae5de0/narwhals/typing.py#L59 + FrameT = TypeVar("FrameT", nw.DataFrame[Any], nw.LazyFrame) + _Polars: TypeAlias = Literal["polars"] _Pandas: TypeAlias = Literal["pandas"] _PyArrow: TypeAlias = Literal["pyarrow"] @@ -111,7 +118,7 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): Used exclusively for ``metadata.parquet``. - Currently ``polars`` backends are the only lazy options. + Currently ``"polars"`` is the only lazy option. 
""" _name: LiteralString @@ -125,12 +132,10 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: - suffix = validate_suffix(source, is_ext_read) - return self._read_fn[suffix] + return self._read_fn[_extract_suffix(source, is_ext_read)] def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: - suffix = validate_suffix(source, is_ext_scan) - return self._scan_fn[suffix] + return self._scan_fn[_extract_suffix(source, is_ext_scan)] def dataset( self, @@ -140,7 +145,7 @@ def dataset( tag: Version | None = None, **kwds: Any, ) -> IntoDataFrameT: - df = self.query(**validate_constraints(name, suffix, tag)) + df = self.query(**_extract_constraints(name, suffix, tag)) it = islice(df.iter_rows(named=True), 1) result = cast("Metadata", next(it)) url = result["url_npm"] @@ -166,7 +171,7 @@ def url( /, tag: Version | None = None, ) -> str: - frame = self.query(**validate_constraints(name, suffix, tag)) + frame = self.query(**_extract_constraints(name, suffix, tag)) url = nw.to_py_scalar(frame.item(0, "url_npm")) if isinstance(url, str): return url @@ -180,6 +185,8 @@ def query( """ Query multi-version trees metadata. + Applies a filter, erroring out when no results would be returned. + Notes ----- Arguments correspond to those seen in `pl.LazyFrame.filter`_. @@ -187,12 +194,7 @@ def query( .. _pl.LazyFrame.filter: https://docs.pola.rs/api/python/stable/reference/lazyframe/api/polars.LazyFrame.filter.html """ - frame = ( - nw.from_native(self.scan_fn(_METADATA)(_METADATA)) - .filter(_parse_predicates_constraints(predicates, constraints)) - .lazy() - .collect() - ) + frame = self._scan_metadata(*predicates, **constraints).collect() if not frame.is_empty(): return frame else: @@ -200,18 +202,13 @@ def query( msg = f"Found no results for:\n {terms}" raise ValueError(msg) - def _read_metadata(self) -> IntoDataFrameT: - """ - Return the full contents of ``metadata.parquet``. - - Effectively an eager read, no filters. - """ - return ( - nw.from_native(self.scan_fn(_METADATA)(_METADATA)) - .lazy() - .collect() - .to_native() - ) + def _scan_metadata( + self, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] + ) -> nw.LazyFrame: + frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() + if predicates or constraints: + return _filter(frame, *predicates, **constraints) + return frame @property def _cache(self) -> Path | None: # type: ignore[return] @@ -406,24 +403,30 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: self._scan_fn = {".parquet": pa_read_parquet} -def _parse_predicates_constraints( - predicates: tuple[Any, ...], constraints: Metadata, / -) -> nw.Expr: +def _filter( + frame: FrameT, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] +) -> FrameT: """ ``narwhals`` only accepts ``filter(*predicates)``. 
So we convert each item in ``**constraints`` here as:: col("column_name") == literal_value + + - https://github.com/narwhals-dev/narwhals/issues/1383 + - https://github.com/narwhals-dev/narwhals/pull/1417 """ - return nw.all_horizontal( - chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + return frame.filter( + nw.all_horizontal( + *chain(predicates, (nw.col(name) == v for name, v in constraints.items())) + ) ) -def validate_constraints( +def _extract_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: + """Transform args into a mapping to column names.""" constraints: Metadata = {} if tag is not None: constraints["tag"] = tag @@ -445,7 +448,7 @@ def validate_constraints( return constraints -def validate_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: +def _extract_suffix(source: StrPath, guard: Callable[..., TypeIs[_T]], /) -> _T: suffix: Any = Path(source).suffix if guard(suffix): return suffix @@ -479,7 +482,6 @@ def infer_backend( .. _fastparquet: https://github.com/dask/fastparquet - """ it = (backend(name) for name in priority if is_available(_requirements(name))) if reader := next(it, None): From 6f41c7e5b830bff1e901ecbe1fcec862f72c4683 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:49:46 +0000 Subject: [PATCH 121/137] build: Generate `VERSION_LATEST` Simplifies logic that relies on enum/categoricals that may not be recognised as ordered --- altair/datasets/_typing.py | 10 +++++++++- tools/datasets/__init__.py | 10 ++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index cdaa57322..0b681b834 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -24,6 +24,7 @@ __all__ = [ "EXTENSION_SUFFIXES", + "VERSION_LATEST", "Dataset", "Extension", "Metadata", @@ -154,7 +155,14 @@ "v1.5.0", ] Extension: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow", ".parquet"] -EXTENSION_SUFFIXES = (".csv", ".json", ".tsv", ".arrow", ".parquet") +VERSION_LATEST: Literal["v2.11.0"] = "v2.11.0" +EXTENSION_SUFFIXES: tuple[ + Literal[".csv"], + Literal[".json"], + Literal[".tsv"], + Literal[".arrow"], + Literal[".parquet"], +] = (".csv", ".json", ".tsv", ".arrow", ".parquet") def is_ext_read(suffix: Any) -> TypeIs[Extension]: diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 26955e9c0..1402a9c7b 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -226,9 +226,14 @@ def generate_typing(self, output: Path, /) -> None: indent = " " * 4 NAME = "Dataset" TAG = "Version" + LATEST = "VERSION_LATEST" + LATEST_TAG = f"{tags.first()!r}" EXT = "Extension" EXTENSION_TYPES = ".csv", ".json", ".tsv", ".arrow", ".parquet" EXTENSION_SUFFIXES = "EXTENSION_SUFFIXES" + EXTENSION_TYPE_TP = ( + f"tuple[{', '.join(f'Literal[{el!r}]' for el in EXTENSION_TYPES)}]" + ) EXTENSION_GUARD = "is_ext_read" METADATA_TD = "Metadata" DESCRIPTION_DEFAULT = "_description_" @@ -318,11 +323,12 @@ def generate_typing(self, output: Path, /) -> None: utils.import_typing_extensions((3, 13), "TypeIs"), utils.import_typing_extensions((3, 10), "TypeAlias"), "\n", - f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES]}\n\n" + f"__all__ = {[NAME, TAG, EXT, METADATA_TD, EXTENSION_GUARD, EXTENSION_SUFFIXES, LATEST]}\n\n" f"{NAME}: TypeAlias = {utils.spell_literal(names)}", f"{TAG}: TypeAlias = 
{utils.spell_literal(tags)}", f"{EXT}: TypeAlias = {utils.spell_literal(EXTENSION_TYPES)}", - f"{EXTENSION_SUFFIXES} = {EXTENSION_TYPES!r}", + f"{LATEST}: Literal[{LATEST_TAG}] = {LATEST_TAG}", + f"{EXTENSION_SUFFIXES}: {EXTENSION_TYPE_TP} = {EXTENSION_TYPES!r}", f"def {EXTENSION_GUARD}(suffix: Any) -> TypeIs[{EXT}]:\n" f"{indent}return suffix in set({EXTENSION_TYPES!r})\n", UNIVERSAL_TYPED_DICT.format( From 88d06a64ac8a21350314b5300fbd7142d57e13cf Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 16:54:16 +0000 Subject: [PATCH 122/137] feat: Adds `_cache.py` for `UrlCache`, `DatasetCache` Docs to follow --- altair/datasets/__init__.py | 2 +- altair/datasets/_cache.py | 226 ++++++++++++++++++++++++++++++++++++ altair/datasets/_loader.py | 110 ++---------------- altair/datasets/_readers.py | 21 +--- tests/test_datasets.py | 75 +++++++++--- 5 files changed, 304 insertions(+), 130 deletions(-) create mode 100644 altair/datasets/_cache.py diff --git a/altair/datasets/__init__.py b/altair/datasets/__init__.py index e426ca467..70d01eacc 100644 --- a/altair/datasets/__init__.py +++ b/altair/datasets/__init__.py @@ -85,7 +85,7 @@ def url( url = load.url(name, suffix, tag=tag) except AltairDatasetsError: - from altair.datasets._loader import url_cache + from altair.datasets._cache import url_cache url = url_cache[name] diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py new file mode 100644 index 000000000..9239911fd --- /dev/null +++ b/altair/datasets/_cache.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args + +import narwhals.stable.v1 as nw +from narwhals.dependencies import get_pyarrow +from narwhals.typing import IntoDataFrameT, IntoFrameT + +from altair.datasets._typing import VERSION_LATEST + +if TYPE_CHECKING: + import sys + from collections.abc import Iterator, MutableMapping + from typing import Any, Final + + from _typeshed import StrPath + + if sys.version_info >= (3, 11): + from typing import LiteralString + else: + from typing_extensions import LiteralString + from altair.datasets._readers import _Reader + from altair.datasets._typing import Dataset + +__all__ = ["DatasetCache", "UrlCache", "url_cache"] + + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") +_T = TypeVar("_T") + +_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" + + +class UrlCache(Generic[_KT, _VT]): + """ + `csv`_, `gzip`_ -based, lazy url lookup. + + Operates on a subset of available datasets: + - Only the latest version + - Excludes `.parquet`, which `cannot be read via url`_ + - Name collisions are pre-resolved + - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) + + .. _csv: + https://docs.python.org/3/library/csv.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. 
_cannot be read via url: + https://github.com/vega/vega/issues/3961 + """ + + def __init__( + self, + fp: Path, + /, + *, + columns: tuple[str, str] = ("dataset_name", "url_npm"), + tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], + ) -> None: + self.fp: Path = fp + self.columns: tuple[str, str] = columns + self._mapping: MutableMapping[_KT, _VT] = tp() + + def read(self) -> Any: + import csv + import gzip + + with gzip.open(self.fp, mode="rb") as f: + b_lines = f.readlines() + reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) + header = tuple(next(reader)) + if header != self.columns: + msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" + raise ValueError(msg) + return dict(reader) + + def __getitem__(self, key: _KT, /) -> _VT: + if url := self.get(key, None): + return url + + from altair.datasets._typing import Dataset + + if key in get_args(Dataset): + msg = f"{key!r} cannot be loaded via url." + raise TypeError(msg) + else: + msg = f"{key!r} does not refer to a known dataset." + raise TypeError(msg) + + def get(self, key: _KT, default: _T) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" + + def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: + self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader + + def download_all(self) -> None: + """ + Download any missing datasets for latest version. + + ``v2.11.0`` stats + ----------------- + - **66** items + - **27.8** MB + - Only 1 file > 2 MB + """ + stems = tuple(fp.stem for fp in self) + latest = nw.col("tag") == nw.lit(VERSION_LATEST) + predicates = (~(nw.col("sha").is_in(stems)), latest) if stems else (latest,) + frame = ( + self._rd._scan_metadata( + *predicates, ext_supported=True, name_collision=False + ) + .select("sha", "suffix", "url_npm") + .unique("sha") + .collect() + ) + if frame.is_empty(): + print("Already downloaded all datasets") + return None + print(f"Downloading {len(frame)} missing datasets...") + for row in frame.iter_rows(named=True): + fp: Path = self.path / (row["sha"] + row["suffix"]) + with self._rd._opener.open(row["url_npm"]) as f: + fp.touch() + fp.write_bytes(f.read()) + print("Finished downloads") + return None + + def clear(self) -> None: + # unlink all matching sha + # stricter than `__iter__` + # - to avoid deleting unrelated files in dir + self.ensure_active() + if self.is_empty(): + return None + ser = ( + self._rd._scan_metadata() + .select("sha", "suffix") + .unique("sha") + .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) + .collect() + .get_column("sha_suffix") + ) + names = set[str]( + ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ) + for fp in self: + if fp.name in names: + fp.unlink() + + def __iter__(self) -> Iterator[Path]: + yield from self.path.iterdir() + + def __repr__(self): + name = type(self).__name__ + if self.is_not_active(): + return f"{name}" + else: + return f"{name}<{self.path.as_posix()!r}>" + + def is_active(self) -> bool: + return not self.is_not_active() + + def is_not_active(self) -> bool: + return os.environ.get(self._ENV_VAR) is None + + def is_empty(self) -> bool: + """Cache is active, but no files in the directory.""" + return next(iter(self), None) is None + + def ensure_active(self) -> None: + # Fail fast when the cache op is later + # Otherwise, just get the error 
from `self.path` + if self.is_not_active(): + msg = ( + f"Cache is unset.\n" + f"To enable dataset caching, set the environment variable:\n" + f" {self._ENV_VAR!r}\n\n" + f"You can set this for the current session via:\n" + f" from pathlib import Path\n" + f" from altair.datasets import load\n\n" + f" load.cache.path = Path.home() / '.altair_cache'" + ) + raise ValueError(msg) + + @property + def path(self) -> Path: + """ + Returns path to datasets cache. + + By default, this can be configured using the environment variable: + + "ALTAIR_DATASETS_DIR" + + You can set this for the current session via: + + >>> from pathlib import Path + >>> from altair.datasets import load + >>> load.cache.path = Path.home() / ".altair_cache" + + >>> load.cache.path.relative_to(Path.home()).as_posix() + '.altair_cache' + """ + self.ensure_active() + fp = Path(os.environ[self._ENV_VAR]) + fp.mkdir(exist_ok=True) + return fp + + @path.setter + def path(self, source: StrPath | None, /) -> None: + if source is not None: + os.environ[self._ENV_VAR] = str(Path(source).resolve()) + else: + os.environ.pop(self._ENV_VAR, None) + + +url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 3e31aea2e..ac56aa892 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -1,7 +1,6 @@ from __future__ import annotations -from pathlib import Path -from typing import TYPE_CHECKING, Generic, TypeVar, final, get_args, overload +from typing import TYPE_CHECKING, Generic, final, overload from narwhals.typing import IntoDataFrameT, IntoFrameT @@ -9,13 +8,13 @@ if TYPE_CHECKING: import sys - from collections.abc import MutableMapping - from typing import Any, Final, Literal + from typing import Any, Literal import pandas as pd import polars as pl import pyarrow as pa - from _typeshed import StrPath + + from altair.datasets._cache import DatasetCache if sys.version_info >= (3, 11): from typing import LiteralString @@ -27,12 +26,6 @@ __all__ = ["Loader", "load"] -_KT = TypeVar("_KT") -_VT = TypeVar("_VT") -_T = TypeVar("_T") - -_URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" - class Loader(Generic[IntoDataFrameT, IntoFrameT]): """ @@ -294,34 +287,18 @@ def url( """ return self._reader.url(name, suffix, tag=tag) + # TODO: Examples for tasklist @property - def cache_dir(self) -> Path | None: + def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ - Returns path to datasets cache. - - By default, this can be configured using the environment variable: - - "ALTAIR_DATASETS_DIR" - - You *may* also set this directly, but the value will **not** persist between sessions: - - from pathlib import Path - - from altair.datasets import Loader - - data = Loader.from_backend("polars") - data.cache_dir = Path.home() / ".altair_cache" + Dataset caching. 
- >>> data.cache_dir.relative_to(Path.home()).as_posix() # doctest: +SKIP - '.altair_cache' + - [x] Enable via 2 examples + - [ ] Disable after enabling (self.cache.path = None) + - [ ] Pre-download missing + - [ ] Clear entire cache """ - return self._reader._cache - - @cache_dir.setter - def cache_dir(self, source: StrPath, /) -> None: - import os - - os.environ[self._reader._ENV_VAR] = str(source) + return self._reader.cache def __repr__(self) -> str: return f"{type(self).__name__}[{self._reader._name}]" @@ -384,69 +361,6 @@ def __call__( return self.from_backend(backend)(name, suffix, tag=tag, **kwds) -class UrlCache(Generic[_KT, _VT]): - """ - `csv`_, `gzip`_ -based, lazy url lookup. - - Operates on a subset of available datasets: - - Only the latest version - - Excludes `.parquet`, which `cannot be read via url`_ - - Name collisions are pre-resolved - - Only provide the smallest (e.g. ``weather.json`` instead of ``weather.csv``) - - .. _csv: - https://docs.python.org/3/library/csv.html - .. _gzip: - https://docs.python.org/3/library/gzip.html - .. _cannot be read via url: - https://github.com/vega/vega/issues/3961 - """ - - def __init__( - self, - fp: Path, - /, - *, - columns: tuple[str, str] = ("dataset_name", "url_npm"), - tp: type[MutableMapping[_KT, _VT]] = dict["_KT", "_VT"], - ) -> None: - self.fp: Path = fp - self.columns: tuple[str, str] = columns - self._mapping: MutableMapping[_KT, _VT] = tp() - - def read(self) -> Any: - import csv - import gzip - - with gzip.open(self.fp, mode="rb") as f: - b_lines = f.readlines() - reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) - header = tuple(next(reader)) - if header != self.columns: - msg = f"Expected header to match {self.columns!r},\n" f"but got: {header!r}" - raise ValueError(msg) - return dict(reader) - - def __getitem__(self, key: _KT, /) -> _VT: - if url := self.get(key, None): - return url - - from altair.datasets._typing import Dataset - - if key in get_args(Dataset): - msg = f"{key!r} cannot be loaded via url." - raise TypeError(msg) - else: - msg = f"{key!r} does not refer to a known dataset." - raise TypeError(msg) - - def get(self, key: _KT, default: _T) -> _VT | _T: - if not self._mapping: - self._mapping.update(self.read()) - return self._mapping.get(key, default) - - -url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) load: _Load[Any, Any] diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 2c8d53820..e7c97b9d1 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -9,7 +9,6 @@ from __future__ import annotations -import os import urllib.request from collections.abc import Iterable, Mapping, Sequence from functools import partial @@ -33,6 +32,7 @@ import narwhals.stable.v1 as nw from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from altair.datasets._cache import DatasetCache from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read if TYPE_CHECKING: @@ -128,7 +128,6 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): Otherwise, has no concrete meaning. 
""" - _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" _opener: ClassVar[OpenerDirector] = urllib.request.build_opener() def read_fn(self, source: StrPath, /) -> Callable[..., IntoDataFrameT]: @@ -151,8 +150,8 @@ def dataset( url = result["url_npm"] fn = self.read_fn(url) - if cache := self._cache: - fp = cache / (result["sha"] + result["suffix"]) + if self.cache.is_active(): + fp = self.cache.path / (result["sha"] + result["suffix"]) if fp.exists() and fp.stat().st_size: return fn(fp, **kwds) else: @@ -211,18 +210,8 @@ def _scan_metadata( return frame @property - def _cache(self) -> Path | None: # type: ignore[return] - """ - Returns path to datasets cache, if possible. - - Requires opt-in via environment variable:: - - Reader._ENV_VAR - """ - if _dir := os.environ.get(self._ENV_VAR): - cache_dir = Path(_dir) - cache_dir.mkdir(exist_ok=True) - return cache_dir + def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: + return DatasetCache(self) def _import(self, name: str, /) -> Any: if spec := find_spec(name): diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 50ece0a26..1d0990abf 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -7,6 +7,7 @@ from functools import partial from importlib import import_module from importlib.util import find_spec +from pathlib import Path from typing import TYPE_CHECKING, Any, cast, get_args from urllib.error import URLError @@ -21,7 +22,7 @@ from altair.datasets import Loader, url from altair.datasets._readers import _METADATA, AltairDatasetsError -from altair.datasets._typing import Dataset, Extension, Metadata, Version +from altair.datasets._typing import Dataset, Extension, Metadata, Version, is_ext_read from tests import skip_requires_pyarrow, slow if sys.version_info >= (3, 14): @@ -104,7 +105,7 @@ def polars_loader( tmp_path_factory: pytest.TempPathFactory, ) -> Loader[pl.DataFrame, pl.LazyFrame]: data = Loader.from_backend("polars") - data.cache_dir = tmp_path_factory.mktemp("loader-cache-polars") + data.cache.path = tmp_path_factory.mktemp("loader-cache-polars") return data @@ -273,7 +274,7 @@ def test_url(name: Dataset) -> None: def test_url_no_backend(monkeypatch: pytest.MonkeyPatch) -> None: import altair.datasets - from altair.datasets._loader import url_cache + from altair.datasets._cache import url_cache monkeypatch.setitem(sys.modules, "polars", None) monkeypatch.setitem(sys.modules, "pandas", None) @@ -477,11 +478,11 @@ def test_reader_cache( monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) data = Loader.from_backend(backend) - cache_dir = data.cache_dir - assert cache_dir is not None + assert data.cache.is_active() + cache_dir = data.cache.path assert cache_dir == tmp_path - assert tuple(cache_dir.iterdir()) == () + assert tuple(data.cache) == () # smallest csvs lookup_groups = data("lookup_groups", tag="v2.5.3") @@ -489,7 +490,7 @@ def test_reader_cache( data("iowa-electricity", tag="v2.3.1") data("global-temp", tag="v2.9.0") - cached_paths = tuple(cache_dir.iterdir()) + cached_paths = tuple(data.cache) assert len(cached_paths) == 4 if is_polars_dataframe(lookup_groups): @@ -504,15 +505,15 @@ def test_reader_cache( ) assert_frame_equal(left, right) - assert len(tuple(cache_dir.iterdir())) == 4 - assert cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) data("iowa-electricity", tag="v1.30.2") data("global-temp", tag="v2.8.1") data("global-temp", tag="v2.8.0") - assert len(tuple(cache_dir.iterdir())) == 4 - assert 
cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) data("lookup_people", tag="v1.10.0") data("lookup_people", tag="v1.11.0") @@ -522,8 +523,52 @@ def test_reader_cache( data("lookup_people", tag="v2.3.0") data("lookup_people", tag="v2.5.0-next.0") - assert len(tuple(cache_dir.iterdir())) == 4 - assert cached_paths == tuple(cache_dir.iterdir()) + assert len(tuple(data.cache)) == 4 + assert cached_paths == tuple(data.cache) + + +@slow +@datasets_debug +@backends +def test_reader_cache_exhaustive( + backend: _Backend, monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + """ + Fully populate and then purge the cache for all backends. + + - Does not attempt to read the files + - Checking we can support pre-downloading and safely deleting + """ + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + data = Loader.from_backend(backend) + assert data.cache.is_active() + cache_dir = data.cache.path + assert cache_dir == tmp_path + assert tuple(data.cache) == () + + data.cache.download_all() + cached_paths = tuple(data.cache) + assert cached_paths != () + + # NOTE: Approximating all datasets downloaded + assert len(cached_paths) >= 40 + assert all( + bool(fp.exists() and is_ext_read(fp.suffix) and fp.stat().st_size) + for fp in data.cache + ) + # NOTE: Confirm this is a no-op + data.cache.download_all() + assert len(cached_paths) == len(tuple(data.cache)) + + # NOTE: Ensure unrelated files in the directory are not removed + dummy: Path = tmp_path / "dummy.json" + dummy.touch(exist_ok=False) + data.cache.clear() + + remaining = tuple(tmp_path.iterdir()) + assert len(remaining) == 1 + assert remaining[0] == dummy + dummy.unlink() movies_fail: ParameterSet = pytest.param( @@ -559,7 +604,7 @@ def test_reader_cache( def test_pyarrow_read_json( fallback: _Polars | None, name: Dataset, monkeypatch: pytest.MonkeyPatch ) -> None: - monkeypatch.setenv(CACHE_ENV_VAR, "") + monkeypatch.delenv(CACHE_ENV_VAR, raising=False) monkeypatch.delitem(sys.modules, "pandas", raising=False) if fallback is None: monkeypatch.setitem(sys.modules, "polars", None) @@ -630,7 +675,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - from polars.testing import assert_frame_equal data = Loader.from_backend("polars") - data.cache_dir = tmp_path + data.cache.path = tmp_path data("londonCentroids") data("stocks") From f21b52b6c932c517383de02087f75228af0f7a28 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 17:59:09 +0000 Subject: [PATCH 123/137] ci(ruff): Ignore `0.8.0` violations https://github.com/vega/altair/discussions/3687#discussioncomment-11351453 --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c43e00504..e398dfb6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -378,7 +378,9 @@ ignore = [ # doc-line-too-long "W505", # Any as annotation - "ANN401" + "ANN401", + # 0.8.0 + "RUF039", "RUF200" ] # https://docs.astral.sh/ruff/settings/#lintpydocstyle pydocstyle={ convention="numpy" } From e7974d90c78a38c06d7e19aeeb54e32179948022 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 22 Nov 2024 19:53:01 +0000 Subject: [PATCH 124/137] fix: Use stable `narwhals` imports https://github.com/narwhals-dev/narwhals/issues/1426, https://github.com/vega/altair/pull/3693#discussion_r1854513083 --- altair/datasets/_cache.py | 8 +++++--- 
altair/datasets/_loader.py | 2 +- altair/datasets/_readers.py | 2 +- tests/test_datasets.py | 25 ++++++++++--------------- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 9239911fd..0166c50e8 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.dependencies import get_pyarrow -from narwhals.typing import IntoDataFrameT, IntoFrameT +from narwhals.stable.v1 import dependencies as nw_dep +from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -151,7 +151,9 @@ def clear(self) -> None: .get_column("sha_suffix") ) names = set[str]( - ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ser.to_list() + if nw.get_native_namespace(ser) is nw_dep.get_pyarrow() + else ser ) for fp in self: if fp.name in names: diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index ac56aa892..5be85e60a 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Generic, final, overload -from narwhals.typing import IntoDataFrameT, IntoFrameT +from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._readers import _Reader, backend diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index e7c97b9d1..5adcf3751 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -30,7 +30,7 @@ ) import narwhals.stable.v1 as nw -from narwhals.typing import IntoDataFrameT, IntoExpr, IntoFrameT +from narwhals.stable.v1.typing import IntoDataFrameT, IntoExpr, IntoFrameT from altair.datasets._cache import DatasetCache from altair.datasets._typing import EXTENSION_SUFFIXES, is_ext_read diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 1d0990abf..20515069b 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -12,13 +12,8 @@ from urllib.error import URLError import pytest -from narwhals.dependencies import ( - is_into_dataframe, - is_pandas_dataframe, - is_polars_dataframe, - is_pyarrow_table, -) from narwhals.stable import v1 as nw +from narwhals.stable.v1 import dependencies as nw_dep from altair.datasets import Loader, url from altair.datasets._readers import _METADATA, AltairDatasetsError @@ -227,11 +222,11 @@ def test_load_call(monkeypatch: pytest.MonkeyPatch) -> None: default_2 = load("cars") df_polars = load("cars", backend="polars") - assert is_polars_dataframe(default) - assert is_pyarrow_table(df_pyarrow) - assert is_pandas_dataframe(df_pandas) - assert is_polars_dataframe(default_2) - assert is_polars_dataframe(df_polars) + assert nw_dep.is_polars_dataframe(default) + assert nw_dep.is_pyarrow_table(df_pyarrow) + assert nw_dep.is_pandas_dataframe(df_pandas) + assert nw_dep.is_polars_dataframe(default_2) + assert nw_dep.is_polars_dataframe(df_polars) @pytest.mark.parametrize( @@ -320,7 +315,7 @@ def test_loader_call(backend: _Backend, monkeypatch: pytest.MonkeyPatch) -> None data = Loader.from_backend(backend) frame = data("stocks", ".csv") - assert is_into_dataframe(frame) + assert nw_dep.is_into_dataframe(frame) nw_frame = nw.from_native(frame) assert set(nw_frame.columns) == {"symbol", "date", "price"} @@ -493,7 +488,7 @@ def test_reader_cache( cached_paths = tuple(data.cache) assert len(cached_paths) == 4 - if 
is_polars_dataframe(lookup_groups): + if nw_dep.is_polars_dataframe(lookup_groups): left, right = ( lookup_groups, cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), @@ -664,7 +659,7 @@ def test_all_datasets( ) -> None: """Ensure all annotated datasets can be loaded with the most reliable backend.""" frame = polars_loader(name, suffix, tag=tag) - assert is_polars_dataframe(frame) + assert nw_dep.is_polars_dataframe(frame) def _raise_exception(e: type[Exception], *args: Any, **kwds: Any): @@ -698,7 +693,7 @@ def test_no_remote_connection(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) - # Now we can get a cache-hit frame = data("birdstrikes") - assert is_polars_dataframe(frame) + assert nw_dep.is_polars_dataframe(frame) assert len(tuple(tmp_path.iterdir())) == 4 with monkeypatch.context() as mp: From c907dc500504cdff8e2342f488fb679cd2108975 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 13:52:44 +0000 Subject: [PATCH 125/137] revert(ruff): Ignore `0.8.0` violations f21b52b6c932c517383de02087f75228af0f7a28 --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a44b4459e..c353b9b9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -377,9 +377,7 @@ ignore = [ # doc-line-too-long "W505", # Any as annotation - "ANN401", - # 0.8.0 - "RUF039", "RUF200" + "ANN401" ] # https://docs.astral.sh/ruff/settings/#lintpydocstyle pydocstyle={ convention="numpy" } From a3b38c49836c850681c41c797865351bddfccbb7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 13:58:53 +0000 Subject: [PATCH 126/137] revert: Remove `_readers._filter` Feature has been adopted upstream in https://github.com/narwhals-dev/narwhals/pull/1417 --- altair/datasets/_readers.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 5adcf3751..354a45532 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -206,7 +206,7 @@ def _scan_metadata( ) -> nw.LazyFrame: frame = nw.from_native(self.scan_fn(_METADATA)(_METADATA)).lazy() if predicates or constraints: - return _filter(frame, *predicates, **constraints) + return frame.filter(*predicates, **constraints) return frame @property @@ -392,26 +392,6 @@ def pa_read_json(source: Any, /, **kwds) -> pa.Table: self._scan_fn = {".parquet": pa_read_parquet} -def _filter( - frame: FrameT, *predicates: OneOrSeq[IntoExpr], **constraints: Unpack[Metadata] -) -> FrameT: - """ - ``narwhals`` only accepts ``filter(*predicates)``. 
- - So we convert each item in ``**constraints`` here as:: - - col("column_name") == literal_value - - - https://github.com/narwhals-dev/narwhals/issues/1383 - - https://github.com/narwhals-dev/narwhals/pull/1417 - """ - return frame.filter( - nw.all_horizontal( - *chain(predicates, (nw.col(name) == v for name, v in constraints.items())) - ) - ) - - def _extract_constraints( name: Dataset | LiteralString, suffix: Extension | None, tag: Version | None, / ) -> Metadata: From a6c5096ddab82fd4682006f90158b71b0f3aa479 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 14:43:11 +0000 Subject: [PATCH 127/137] feat: Adds example and tests for disabling caching --- altair/datasets/_cache.py | 4 ++++ altair/datasets/_loader.py | 2 +- tests/test_datasets.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index 0166c50e8..f801a26d1 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -211,6 +211,10 @@ def path(self) -> Path: >>> load.cache.path.relative_to(Path.home()).as_posix() '.altair_cache' + + You can *later* disable caching via: + + >>> load.cache.path = None """ self.ensure_active() fp = Path(os.environ[self._ENV_VAR]) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 5be85e60a..111af950b 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -294,7 +294,7 @@ def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: Dataset caching. - [x] Enable via 2 examples - - [ ] Disable after enabling (self.cache.path = None) + - [x] Disable after enabling (self.cache.path = None) - [ ] Pre-download missing - [ ] Clear entire cache """ diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 20515069b..5d2b93c2d 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -566,6 +566,36 @@ def test_reader_cache_exhaustive( dummy.unlink() +def test_reader_cache_disable(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + from altair.datasets import load + + monkeypatch.setenv(CACHE_ENV_VAR, str(tmp_path)) + + assert load.cache.is_active() + assert load.cache.path == tmp_path + assert load.cache.is_empty() + load("cars") + assert not load.cache.is_empty() + + # RELATED: https://github.com/python/mypy/issues/3004 + load.cache.path = None # type: ignore[assignment] + + assert load.cache.is_not_active() + with pytest.raises( + ValueError, + match=re.compile( + rf"Cache.+unset.+{CACHE_ENV_VAR}.+\.cache\.path =", flags=re.DOTALL + ), + ): + tuple(load.cache) + + load.cache.path = tmp_path + + assert load.cache.is_active() + assert load.cache.path == tmp_path + assert not load.cache.is_empty() + + movies_fail: ParameterSet = pytest.param( "movies", marks=pytest.mark.xfail( From 71423eadfe63a767c2b591f743b3a36272d59c7d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 15:11:41 +0000 Subject: [PATCH 128/137] refactor: Tidy up `DatasetCache` --- altair/datasets/_cache.py | 124 ++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 67 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index f801a26d1..f9e3c683a 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.stable.v1 import dependencies as 
nw_dep +from narwhals.stable.v1.dependencies import get_pyarrow from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -102,22 +102,38 @@ class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: self._rd: _Reader[IntoDataFrameT, IntoFrameT] = reader + def clear(self) -> None: + """Delete all previously cached datasets.""" + self._ensure_active() + if self.is_empty(): + return None + ser = ( + self._rd._scan_metadata() + .select("sha", "suffix") + .unique("sha") + .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) + .collect() + .get_column("sha_suffix") + ) + names = set[str]( + ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser + ) + for fp in self: + if fp.name in names: + fp.unlink() + def download_all(self) -> None: """ Download any missing datasets for latest version. - ``v2.11.0`` stats - ----------------- - - **66** items - - **27.8** MB - - Only 1 file > 2 MB + Requires **30-50MB** of disk-space. """ stems = tuple(fp.stem for fp in self) latest = nw.col("tag") == nw.lit(VERSION_LATEST) predicates = (~(nw.col("sha").is_in(stems)), latest) if stems else (latest,) frame = ( self._rd._scan_metadata( - *predicates, ext_supported=True, name_collision=False + predicates, ext_supported=True, name_collision=False ) .select("sha", "suffix", "url_npm") .unique("sha") @@ -135,65 +151,6 @@ def download_all(self) -> None: print("Finished downloads") return None - def clear(self) -> None: - # unlink all matching sha - # stricter than `__iter__` - # - to avoid deleting unrelated files in dir - self.ensure_active() - if self.is_empty(): - return None - ser = ( - self._rd._scan_metadata() - .select("sha", "suffix") - .unique("sha") - .select(nw.concat_str("sha", "suffix").alias("sha_suffix")) - .collect() - .get_column("sha_suffix") - ) - names = set[str]( - ser.to_list() - if nw.get_native_namespace(ser) is nw_dep.get_pyarrow() - else ser - ) - for fp in self: - if fp.name in names: - fp.unlink() - - def __iter__(self) -> Iterator[Path]: - yield from self.path.iterdir() - - def __repr__(self): - name = type(self).__name__ - if self.is_not_active(): - return f"{name}" - else: - return f"{name}<{self.path.as_posix()!r}>" - - def is_active(self) -> bool: - return not self.is_not_active() - - def is_not_active(self) -> bool: - return os.environ.get(self._ENV_VAR) is None - - def is_empty(self) -> bool: - """Cache is active, but no files in the directory.""" - return next(iter(self), None) is None - - def ensure_active(self) -> None: - # Fail fast when the cache op is later - # Otherwise, just get the error from `self.path` - if self.is_not_active(): - msg = ( - f"Cache is unset.\n" - f"To enable dataset caching, set the environment variable:\n" - f" {self._ENV_VAR!r}\n\n" - f"You can set this for the current session via:\n" - f" from pathlib import Path\n" - f" from altair.datasets import load\n\n" - f" load.cache.path = Path.home() / '.altair_cache'" - ) - raise ValueError(msg) - @property def path(self) -> Path: """ @@ -216,7 +173,7 @@ def path(self) -> Path: >>> load.cache.path = None """ - self.ensure_active() + self._ensure_active() fp = Path(os.environ[self._ENV_VAR]) fp.mkdir(exist_ok=True) return fp @@ -228,5 +185,38 @@ def path(self, source: StrPath | None, /) -> None: else: os.environ.pop(self._ENV_VAR, None) + def __iter__(self) -> Iterator[Path]: + yield from self.path.iterdir() + + def __repr__(self) -> str: + name = 
type(self).__name__ + if self.is_not_active(): + return f"{name}" + else: + return f"{name}<{self.path.as_posix()!r}>" + + def is_active(self) -> bool: + return not self.is_not_active() + + def is_not_active(self) -> bool: + return os.environ.get(self._ENV_VAR) is None + + def is_empty(self) -> bool: + """Cache is active, but no files are stored in ``self.path``.""" + return next(iter(self), None) is None + + def _ensure_active(self) -> None: + if self.is_not_active(): + msg = ( + f"Cache is unset.\n" + f"To enable dataset caching, set the environment variable:\n" + f" {self._ENV_VAR!r}\n\n" + f"You can set this for the current session via:\n" + f" from pathlib import Path\n" + f" from altair.datasets import load\n\n" + f" load.cache.path = Path.home() / '.altair_cache'" + ) + raise ValueError(msg) + url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) From 7dd9c18a6eef4c15baa91540ef887c30e38bff04 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 15:25:13 +0000 Subject: [PATCH 129/137] docs: Finish `Loader.cache` Not using doctest style here, none of these return anything but I want them hinted at --- altair/datasets/_cache.py | 2 ++ altair/datasets/_loader.py | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index f9e3c683a..ce058c561 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -97,6 +97,8 @@ def get(self, key: _KT, default: _T) -> _VT | _T: class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): + """Optional caching of remote dataset requests.""" + _ENV_VAR: ClassVar[LiteralString] = "ALTAIR_DATASETS_DIR" def __init__(self, reader: _Reader[IntoDataFrameT, IntoFrameT], /) -> None: diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index 111af950b..ce2559aed 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -287,16 +287,22 @@ def url( """ return self._reader.url(name, suffix, tag=tag) - # TODO: Examples for tasklist @property def cache(self) -> DatasetCache[IntoDataFrameT, IntoFrameT]: """ - Dataset caching. + Optional caching of remote dataset requests. - - [x] Enable via 2 examples - - [x] Disable after enabling (self.cache.path = None) - - [ ] Pre-download missing - - [ ] Clear entire cache + Enable caching: + + self.cache.path = ... + + Download the latest datasets *ahead-of-time*: + + self.cache.download_all() + + Remove all downloaded datasets: + + self.cache.clear() """ return self._reader.cache From a982759715061c436ea93aea8234cd04dfca4657 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 24 Nov 2024 17:26:20 +0000 Subject: [PATCH 130/137] refactor(typing): Use `Mapping` instead of `dict` Mutability is not needed. Also see https://github.com/vega/altair/pull/3573 --- altair/datasets/_readers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 354a45532..9228c5531 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -105,14 +105,14 @@ class _Reader(Protocol[IntoDataFrameT, IntoFrameT]): _Reader._name """ - _read_fn: dict[Extension, Callable[..., IntoDataFrameT]] + _read_fn: Mapping[Extension, Callable[..., IntoDataFrameT]] """ Eager file read functions. Each corresponds to a known file extension within ``vega-datasets``. 
""" - _scan_fn: dict[_ExtensionScan, Callable[..., IntoFrameT]] + _scan_fn: Mapping[_ExtensionScan, Callable[..., IntoFrameT]] """ *Optionally*-lazy file read/scan functions. From d20e9c11071898bb3f418fda22bf3f915ff949e8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 30 Nov 2024 14:44:42 +0000 Subject: [PATCH 131/137] perf: Use `to_list()` for all backends https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508957161, https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508928135, https://github.com/narwhals-dev/narwhals/issues/1443#issuecomment-2508981618 --- altair/datasets/_cache.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index ce058c561..edca990d6 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args import narwhals.stable.v1 as nw -from narwhals.stable.v1.dependencies import get_pyarrow from narwhals.stable.v1.typing import IntoDataFrameT, IntoFrameT from altair.datasets._typing import VERSION_LATEST @@ -117,9 +116,7 @@ def clear(self) -> None: .collect() .get_column("sha_suffix") ) - names = set[str]( - ser.to_list() if nw.get_native_namespace(ser) is get_pyarrow() else ser - ) + names = set[str](ser.to_list()) for fp in self: if fp.name in names: fp.unlink() From 909e7d05e57718b2f634a7e6781cb4e58a835837 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:38:12 +0000 Subject: [PATCH 132/137] feat(DRAFT): Utilize `datapackage` schemas in `pandas` backends Provides a generalized solution to `pd.read_(csv|json)` requiring the names of date columns to attempt parsing. 
cc @joelostblom The solution is possible in large part to https://github.com/vega/vega-datasets/pull/631 https://github.com/vega/altair/pull/3631#issuecomment-2480816377 --- altair/datasets/_cache.py | 149 ++++++++++++++++-- .../_metadata/datapackage_schemas.json.gz | Bin 0 -> 2490 bytes altair/datasets/_readers.py | 37 ++++- altair/datasets/_typing.py | 22 +++ tests/test_datasets.py | 68 +++++++- tools/datasets/__init__.py | 85 +++++++--- tools/datasets/datapackage.py | 133 ++++++++++++++++ tools/datasets/models.py | 94 ++++++++++- tools/datasets/npm.py | 54 ++++++- 9 files changed, 600 insertions(+), 42 deletions(-) create mode 100644 altair/datasets/_metadata/datapackage_schemas.json.gz create mode 100644 tools/datasets/datapackage.py diff --git a/altair/datasets/_cache.py b/altair/datasets/_cache.py index edca990d6..22c652bf3 100644 --- a/altair/datasets/_cache.py +++ b/altair/datasets/_cache.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import sys from pathlib import Path from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar, get_args @@ -9,19 +10,32 @@ from altair.datasets._typing import VERSION_LATEST +if sys.version_info >= (3, 12): + from typing import Protocol +else: + from typing_extensions import Protocol + if TYPE_CHECKING: - import sys - from collections.abc import Iterator, MutableMapping + from collections.abc import Iterator, Mapping, MutableMapping + from io import IOBase from typing import Any, Final from _typeshed import StrPath + from narwhals.stable.v1.dtypes import DType if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias from altair.datasets._readers import _Reader - from altair.datasets._typing import Dataset + from altair.datasets._typing import Dataset, FlFieldStr + + _Dataset: TypeAlias = "Dataset | LiteralString" + _FlSchema: TypeAlias = Mapping[str, FlFieldStr] __all__ = ["DatasetCache", "UrlCache", "url_cache"] @@ -31,9 +45,62 @@ _T = TypeVar("_T") _URL: Final[Path] = Path(__file__).parent / "_metadata" / "url.csv.gz" +_SCHEMA: Final[Path] = ( + Path(__file__).parent / "_metadata" / "datapackage_schemas.json.gz" +) + +_FIELD_TO_DTYPE: Mapping[FlFieldStr, type[DType]] = { + "integer": nw.Int64, + "number": nw.Float64, + "boolean": nw.Boolean, + "string": nw.String, + "object": nw.Struct, + "array": nw.List, + "date": nw.Date, + "datetime": nw.Datetime, + # "time": nw.Time, (Not Implemented, but we don't have any cases using it anyway) + "duration": nw.Duration, +} +""" +Similar to an inverted `pl.datatypes.convert.dtype_to_ffiname`_. + +But using the string repr of ``frictionless`` `Field Types`_ to `narwhals.dtypes`_. + +.. _pl.datatypes.convert.dtype_to_ffiname: + https://github.com/pola-rs/polars/blob/85d078c066860e012f5e7e611558e6382b811b82/py-polars/polars/datatypes/convert.py#L139-L165 +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +.. _narwhals.dtypes: + https://narwhals-dev.github.io/narwhals/api-reference/dtypes/ +""" + +_DTYPE_TO_FIELD: Mapping[type[DType], FlFieldStr] = { + v: k for k, v in _FIELD_TO_DTYPE.items() +} + + +class CompressedCache(Protocol[_KT, _VT]): + fp: Path + _mapping: MutableMapping[_KT, _VT] + + def read(self) -> Any: ... + def __getitem__(self, key: _KT, /) -> _VT: ... 
+ + def __enter__(self) -> IOBase: + import gzip + + return gzip.open(self.fp, mode="rb").__enter__() + def __exit__(self, *args) -> None: + return -class UrlCache(Generic[_KT, _VT]): + def get(self, key: _KT, default: _T, /) -> _VT | _T: + if not self._mapping: + self._mapping.update(self.read()) + return self._mapping.get(key, default) + + +class UrlCache(CompressedCache[_KT, _VT]): """ `csv`_, `gzip`_ -based, lazy url lookup. @@ -65,9 +132,8 @@ def __init__( def read(self) -> Any: import csv - import gzip - with gzip.open(self.fp, mode="rb") as f: + with self as f: b_lines = f.readlines() reader = csv.reader((bs.decode() for bs in b_lines), dialect=csv.unix_dialect) header = tuple(next(reader)) @@ -89,10 +155,72 @@ def __getitem__(self, key: _KT, /) -> _VT: msg = f"{key!r} does not refer to a known dataset." raise TypeError(msg) - def get(self, key: _KT, default: _T) -> _VT | _T: - if not self._mapping: - self._mapping.update(self.read()) - return self._mapping.get(key, default) + +class SchemaCache(CompressedCache["_Dataset", "_FlSchema"]): + """ + `json`_, `gzip`_ -based, lazy schema lookup. + + - Primarily benefits ``pandas``, which needs some help identifying **temporal** columns. + - Utilizes `data package`_ schema types. + - All methods return falsy containers instead of exceptions + + .. _json: + https://docs.python.org/3/library/json.html + .. _gzip: + https://docs.python.org/3/library/gzip.html + .. _data package: + https://github.com/vega/vega-datasets/pull/631 + """ + + def __init__( + self, + fp: Path, + /, + *, + tp: type[MutableMapping[_Dataset, _FlSchema]] = dict["_Dataset", "_FlSchema"], + ) -> None: + self.fp: Path = fp + self._mapping: MutableMapping[_Dataset, _FlSchema] = tp() + + def read(self) -> Any: + import json + + with self as f: + return json.load(f) + + def __getitem__(self, key: _Dataset, /) -> _FlSchema: + return self.get(key, {}) + + def by_dtype(self, name: _Dataset, *dtypes: type[DType]) -> list[str]: + """ + Return column names specfied in ``name``'s schema. + + Parameters + ---------- + name + Dataset name. + *dtypes + Optionally, only return columns matching the given data type(s). + """ + if (match := self[name]) and dtypes: + include = {_DTYPE_TO_FIELD[tp] for tp in dtypes} + return [col for col, tp_str in match.items() if tp_str in include] + else: + return list(match) + + def schema(self, name: _Dataset, /) -> Mapping[str, DType]: + return { + column: _FIELD_TO_DTYPE[tp_str]() for column, tp_str in self[name].items() + } + + def schema_cast(self, name: _Dataset, /) -> Iterator[nw.Expr]: + """ + Can be passed directly to `.with_columns(...). + + BUG: `cars` doesnt work in either pandas backend + """ + for column, dtype in self.schema(name).items(): + yield nw.col(column).cast(dtype) class DatasetCache(Generic[IntoDataFrameT, IntoFrameT]): @@ -219,3 +347,4 @@ def _ensure_active(self) -> None: url_cache: UrlCache[Dataset | LiteralString, str] = UrlCache(_URL) +schema_cache = SchemaCache(_SCHEMA) diff --git a/altair/datasets/_metadata/datapackage_schemas.json.gz b/altair/datasets/_metadata/datapackage_schemas.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..537dcd28ba9377319523683299cb1773ddf40e79 GIT binary patch literal 2490 zcmV;r2}SlFiwFn+00002|72lwVQ^t%Yhh<)Uvpz$tz$R-L!ItIS||(ffKCc2x9p&f;3z=l4IH%W+WdN6

XuiARkfAhqQ;6aWN zvyuC)aAS!);apf|9XB@6{-~V5biPnJSDd-ZVjh{YnFf{ur8NhZ?3B`f5ScF{DT0u# zaiUk=CCh**!KMW6xB`Po84iu)UpvDar|T}mnBRB|`pK|5T{|X}Cp@|06IZQ8I+AQ= zgm%4(3Sq2E6|0eNNEQWClY;~0+)PgyhFvZ9Do%qlj+~O^x#8AguM}mq(WcTmpJH6s zU@o;Pg==#@Rfu<61oeW~`l&bK%gKXq64CniI{9R@wMf3t+Q@t>avsIPM^}GbC2J-* zM%@{mx0#QhZ4-MA^IKIP5K0e6Pj>A_K`C z7H3EG?DTQM`72y*C(uaL=Cw`WtfSpKzzhBQy6m=N8S%%Xmy z8;)F{0zXFP(OaZqsV&Y2qEt+x){Hw+F+o}Op!vi1IJ>nk+H}cE=a}|zB-hEeQPw3Hd`@^iCp@1Mp3f1_55JQWM22%B!#R;*L1b7E85Tr_ z1(9JvWLOXx7DVq0!t;XgydXR;3C~Ny^OEqqBs?z(&r8DdlJLAFJTD2)OT;rY<);KP zCy+A&c}E~W6Uci4`9L7&1hOCy!t>p^5d3|e7p0>qhCD1kXrH-?3mJ3;XN<>3d#G)L3m3x zjr&9KdtN6qrC$=nLh%1Q-VRN8X>{B4nIrrp?+wNtX%zd9`rnaP02pm}#%v7r#AJ(i zGsEhuCkjae?e-(fBk~jx*+WKK_xN%qw#o5~_1nKU4r#5O1ukY8^OltRB@tN1MC%VqR3yQXut#T*W-p- z=UsBctWAC@F>c+zUX29!Q#8Iu?gfB9X{2L*yLJ92C&0SQ4McZXp2M&xa{m1pSkO0# zl36^Z?}o`)#hG*!4BVNn`yn^#jC=fa)b-y^MLoJfH&Creh}!0T=+0sTWCK~4ADKGm zKYmxd5hnM49|ym<=rch=8xz1}h7}GR?vW=q;5%p=^RXyk&Q31sFtF(&$yq%t5)Oxc z38E%=)wCxLKcrARxee=kyvgdfmr113-Btei2`=q*IYoXJGWV;1WDT9iF;py~!`oDl z1Ub%1TbJBM9ybEQ7kv@Mgwme;4achAIVaZ_>jZ>*RN9e(R%^vfRvp{rgCiJ6-gfYk z-G|0AraTG}N|$(d^mPd@SDY6T2kWl9jeHOCHF^}=uex@Ap|mQPsqu}FK4{xkY7Kos z6J9Fmo1Nw62_nzqp|PO&foA%>b9H!_P9f2!QkNw?07C)D*guxyZ<%Tfmbuo@&+^u= zjpS*N0&jh_`WD(5mh;s4KHJA`|6mAhnRJ3A&26P2lQJ33geQPX-7Lb}$CZN{@-9TZ8unpEIxV7-DKLxjM{KWv-*$aQb!3hq$ z_9^|?S;>WeLF`Xz8+wPKwBe`BV Callable[..., IntoDataFrameT]: def scan_fn(self, source: StrPath, /) -> Callable[..., IntoFrameT]: return self._scan_fn[_extract_suffix(source, is_ext_scan)] + def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: + """Hook to provide additional schema metadata on read.""" + return {} + def dataset( self, name: Dataset | LiteralString, @@ -149,6 +153,8 @@ def dataset( result = cast("Metadata", next(it)) url = result["url_npm"] fn = self.read_fn(url) + if default_kwds := self._schema_kwds(result): + kwds = default_kwds | kwds if kwds else default_kwds if self.cache.is_active(): fp = self.cache.path / (result["sha"] + result["suffix"]) @@ -238,7 +244,32 @@ def __repr__(self) -> str: def __init__(self, name: LiteralString, /) -> None: ... -class _PandasReader(_Reader["pd.DataFrame", "pd.DataFrame"]): +class _PandasReaderBase(_Reader["pd.DataFrame", "pd.DataFrame"], Protocol): + """ + Provides temporal column names as keyword arguments on read. 
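
In practice the translation amounts to: look up the dataset's field types, collect the temporal column names, and forward them as ``parse_dates`` (csv/tsv) or ``convert_dates`` (json). A small self-contained sketch, with an illustrative ``fields`` mapping standing in for the cached schema:

    from io import StringIO

    import pandas as pd

    # Illustrative frictionless-style field types for a hypothetical dataset.
    fields = {"date": "date", "price": "number", "symbol": "string"}
    date_cols = [name for name, tp in fields.items() if tp in {"date", "datetime"}]

    csv_text = "date,price,symbol\n2000-01-01,39.81,MSFT\n2000-02-01,36.35,MSFT\n"
    df = pd.read_csv(StringIO(csv_text), parse_dates=date_cols)
    assert str(df["date"].dtype).startswith("datetime64")  # parsed, not left as object
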
+ + Related + ------- + - https://github.com/vega/altair/pull/3631#issuecomment-2480816377 + - https://github.com/vega/vega-datasets/pull/631 + - https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + - https://pandas.pydata.org/docs/reference/api/pandas.read_json.html + """ + + def _schema_kwds(self, result: Metadata, /) -> dict[str, Any]: + from altair.datasets._cache import schema_cache + + name: Any = result["dataset_name"] + suffix = result["suffix"] + if cols := schema_cache.by_dtype(name, nw.Date, nw.Datetime): + if suffix == ".json": + return {"convert_dates": cols} + elif suffix in {".csv", ".tsv"}: + return {"parse_dates": cols} + return super()._schema_kwds(result) + + +class _PandasReader(_PandasReaderBase): def __init__(self, name: _Pandas, /) -> None: self._name = _requirements(name) if not TYPE_CHECKING: @@ -253,7 +284,7 @@ def __init__(self, name: _Pandas, /) -> None: self._scan_fn = {".parquet": pd.read_parquet} -class _PandasPyArrowReader(_Reader["pd.DataFrame", "pd.DataFrame"]): +class _PandasPyArrowReader(_PandasReaderBase): def __init__(self, name: Literal["pandas[pyarrow]"], /) -> None: _pd, _pa = _requirements(name) self._name = name diff --git a/altair/datasets/_typing.py b/altair/datasets/_typing.py index 0b681b834..c83c6066e 100644 --- a/altair/datasets/_typing.py +++ b/altair/datasets/_typing.py @@ -257,3 +257,25 @@ class Metadata(TypedDict, total=False): suffix: str tag: str url_npm: str + + +FlFieldStr: TypeAlias = Literal[ + "integer", + "number", + "boolean", + "string", + "object", + "array", + "date", + "datetime", + "time", + "duration", +] +""" +String representation of `frictionless`_ `Field Types`_. + +.. _frictionless: + https://github.com/frictionlessdata/frictionless-py +.. _Field Types: + https://datapackage.org/standard/table-schema/#field-types +""" diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 5d2b93c2d..9d91c275e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -30,10 +30,12 @@ from pathlib import Path from typing import Literal + import pandas as pd import polars as pl from _pytest.mark.structures import ParameterSet - from altair.datasets._readers import _Backend, _Polars + from altair.datasets._readers import _Backend, _PandasAny, _Polars + from altair.vegalite.v5.schema._typing import OneOrSeq from tests import MarksType CACHE_ENV_VAR: Literal["ALTAIR_DATASETS_DIR"] = "ALTAIR_DATASETS_DIR" @@ -743,3 +745,67 @@ def test_metadata_columns(backend: _Backend, metadata_columns: frozenset[str]) - native = fn(_METADATA) schema_columns = nw.from_native(native).lazy().collect().columns assert set(schema_columns) == metadata_columns + + +@skip_requires_pyarrow +@pytest.mark.parametrize("backend", ["pandas", "pandas[pyarrow]"]) +@pytest.mark.parametrize( + ("name", "columns"), + [ + ("birdstrikes", "Flight Date"), + ("cars", "Year"), + ("co2-concentration", "Date"), + ("crimea", "date"), + ("football", "date"), + ("iowa-electricity", "year"), + ("la-riots", "death_date"), + ("ohlc", "date"), + ("seattle-weather-hourly-normals", "date"), + ("seattle-weather", "date"), + ("sp500-2000", "date"), + ("unemployment-across-industries", "date"), + ("us-employment", "month"), + ], +) +def test_pandas_date_parse( + backend: _PandasAny, + name: Dataset, + columns: OneOrSeq[str], + polars_loader: Loader[pl.DataFrame, pl.LazyFrame], +) -> None: + """ + Ensure schema defaults are correctly parsed. + + NOTE: + - Depends on ``frictionless`` being able to detect the date/datetime columns. 
+ - Not all format strings work + """ + date_columns: list[str] = [columns] if isinstance(columns, str) else list(columns) + + load = Loader.from_backend(backend) + url = load.url(name) + kwds: dict[str, Any] = ( + {"convert_dates": date_columns} + if url.endswith(".json") + else {"parse_dates": date_columns} + ) + kwds_empty: dict[str, Any] = {k: [] for k in kwds} + + df_schema_derived: pd.DataFrame = load(name) + nw_schema = nw.from_native(df_schema_derived).schema + + df_manually_specified: pd.DataFrame = load(name, **kwds) + df_dates_empty: pd.DataFrame = load(name, **kwds_empty) + + assert set(date_columns).issubset(nw_schema) + for column in date_columns: + assert nw_schema[column] in {nw.Date, nw.Datetime} + + assert nw_schema == nw.from_native(df_manually_specified).schema + assert nw_schema != nw.from_native(df_dates_empty).schema + + # NOTE: Checking `polars` infers the same[1] as what `pandas` needs a hint for + # [1] Doesn't need to be exact, just recognise as *some kind* of date/datetime + pl_schema: pl.Schema = polars_loader(name).schema + for column in date_columns: + assert pl_schema[column].is_temporal() diff --git a/tools/datasets/__init__.py b/tools/datasets/__init__.py index 1402a9c7b..66c31e6f6 100644 --- a/tools/datasets/__init__.py +++ b/tools/datasets/__init__.py @@ -39,7 +39,15 @@ else: from typing_extensions import TypeAlias - _PathAlias: TypeAlias = Literal["npm_tags", "gh_tags", "gh_trees"] + _PathAlias: TypeAlias = Literal[ + "npm_tags", + "gh_tags", + "gh_trees", + "typing", + "url", + "dpkg_features", + "dpkg_schemas", + ] __all__ = ["app"] @@ -102,15 +110,17 @@ def __init__( npm_cdn_url=self._npm.url.CDN, **kwds_gh, ) - self._paths = types.MappingProxyType["_PathAlias", Path]( + self.paths = types.MappingProxyType["_PathAlias", Path]( { "npm_tags": self.npm._paths["tags"], "gh_tags": self.github._paths["tags"], "gh_trees": self.github._paths["trees"], + "typing": out_fp_typing, + "url": out_dir_altair / "url.csv.gz", + "dpkg_features": out_dir_altair / "datapackage_features.parquet", + "dpkg_schemas": out_dir_altair / "datapackage_schemas.json.gz", } ) - self._fp_typing: Path = out_fp_typing - self._fp_url: Path = out_dir_altair / "url.csv.gz" @property def github(self) -> GitHub: @@ -131,13 +141,13 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: """ print("Syncing datasets ...") npm_tags = self.npm.tags() - self.write_parquet(npm_tags, self._paths["npm_tags"]) + self.write_parquet(npm_tags, self.paths["npm_tags"]) gh_tags = self.github.refresh_tags(npm_tags) - self.write_parquet(gh_tags, self._paths["gh_tags"]) + self.write_parquet(gh_tags, self.paths["gh_tags"]) gh_trees = self.github.refresh_trees(gh_tags) - self.write_parquet(gh_trees, self._paths["gh_trees"]) + self.write_parquet(gh_trees, self.paths["gh_trees"]) npm_urls_min = ( gh_trees.lazy() @@ -145,31 +155,29 @@ def refresh(self, *, include_typing: bool = False) -> pl.DataFrame: .filter(col("size") == col("size").min().over("dataset_name")) .select("dataset_name", "url_npm") ) - self.write_csv_gzip(npm_urls_min, self._fp_url) + self.write_csv_gzip(npm_urls_min, self.paths["url"]) + + package = self.npm.datapackage() + # TODO: Re-enable after deciding on how best to utilize + # self.write_parquet(package["features"], self.paths["dpkg_features"]) + self.write_json_gzip(package["schemas"], self.paths["dpkg_schemas"]) if include_typing: - self.generate_typing(self._fp_typing) + self.generate_typing() return gh_trees def reset(self) -> None: """Remove all metadata files.""" - for 
fp in self._paths.values(): + for fp in self.paths.values(): fp.unlink(missing_ok=True) def read(self, name: _PathAlias, /) -> pl.DataFrame: """Read existing metadata from file.""" - return pl.read_parquet(self._from_alias(name)) + return pl.read_parquet(self.paths[name]) def scan(self, name: _PathAlias, /) -> pl.LazyFrame: """Scan existing metadata from file.""" - return pl.scan_parquet(self._from_alias(name)) - - def _from_alias(self, name: _PathAlias, /) -> Path: - if name not in {"npm_tags", "gh_tags", "gh_trees"}: - msg = f'Expected one of {["npm_tags", "gh_tags", "gh_trees"]!r}, but got: {name!r}' - raise TypeError(msg) - else: - return self._paths[name] + return pl.scan_parquet(self.paths[name]) def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """ @@ -193,6 +201,21 @@ def write_csv_gzip(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> Non df.write_csv(buf) f.write(buf.getbuffer()) + def write_json_gzip(self, obj: Any, fp: Path, /) -> None: + """ + Write ``obj`` as a `gzip`_ compressed ``json`` file. + + .. _gzip: + https://docs.python.org/3/library/gzip.html + """ + if fp.suffix != ".gz": + fp = fp.with_suffix(".json.gz") + if not fp.exists(): + fp.touch() + + with gzip.GzipFile(fp, mode="wb", mtime=0) as f: + f.write(json.dumps(obj).encode()) + def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None: """Write ``frame`` to ``fp``, with some extra safety.""" if not fp.exists(): @@ -207,7 +230,7 @@ def write_parquet(self, frame: pl.DataFrame | pl.LazyFrame, fp: Path, /) -> None with fp_schema.open("w") as f: json.dump(schema, f, indent=2) - def generate_typing(self, output: Path, /) -> None: + def generate_typing(self) -> None: from tools.generate_schema_wrapper import UNIVERSAL_TYPED_DICT tags = self.scan("gh_tags").select("tag").collect().to_series() @@ -314,6 +337,20 @@ def generate_typing(self, output: Path, /) -> None: f"{textwrap.indent(textwrap.dedent(examples), indent)}" ) + FIELD = "FlFieldStr" + FIELD_TYPES = ( + "integer", + "number", + "boolean", + "string", + "object", + "array", + "date", + "datetime", + "time", + "duration", + ) + contents = ( f"{HEADER_COMMENT}", "from __future__ import annotations\n", @@ -341,8 +378,14 @@ def generate_typing(self, output: Path, /) -> None: doc=metadata_doc, comment="", ), + f"{FIELD}: TypeAlias = {utils.spell_literal(FIELD_TYPES)}\n" + '"""\n' + "String representation of `frictionless`_ `Field Types`_.\n\n" + f".. _frictionless:\n{indent}https://github.com/frictionlessdata/frictionless-py\n" + f".. _Field Types:\n{indent}https://datapackage.org/standard/table-schema/#field-types\n" + '"""\n', ) - ruff.write_lint_format(output, contents) + ruff.write_lint_format(self.paths["typing"], contents) _alt_datasets = Path(__file__).parent.parent.parent / "altair" / "datasets" diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py new file mode 100644 index 000000000..da1f8375e --- /dev/null +++ b/tools/datasets/datapackage.py @@ -0,0 +1,133 @@ +""" +``frictionless`` `datapackage`_ parsing. + +.. 
_datapackage: + https://datapackage.org/ +""" + +from __future__ import annotations + +from collections import deque +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal, get_args + +import polars as pl +from polars import col +from polars import selectors as cs + +from tools.datasets.models import ParsedPackage +from tools.schemapi import utils + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator, Mapping, Sequence + + from altair.datasets._typing import Dataset, FlFieldStr + from tools.datasets.models import FlPackage + + +__all__ = ["parse_package"] + + +DATASET_NAME: Literal["dataset_name"] = "dataset_name" + +# # NOTE: Flag columns +# Storing these instead of the full **56KB** `datapackage.json` +FEATURES: Sequence[pl.Expr] = ( + (col("format") == "png").alias("is_image"), + (col("type") == "table").alias("is_tabular"), + (col("format") == "geojson").alias("is_geo"), + (col("format") == "topojson").alias("is_topo"), + col("format").is_in(("geojson", "topojson")).alias("is_spatial"), + (col("format").str.contains("json")).alias("is_json"), +) + + +def parse_package(pkg: FlPackage, /) -> ParsedPackage: + return ParsedPackage(features=extract_features(pkg), schemas=extract_schemas(pkg)) + + +def extract_schemas(pkg: FlPackage, /) -> Mapping[Dataset, Mapping[str, FlFieldStr]]: + """Reduce all datasets with schemas to a minimal mapping.""" + m: Any = { + Path(rsrc["path"]).stem: {f["name"]: f["type"] for f in s["fields"]} + for rsrc in pkg["resources"] + if (s := rsrc.get("schema")) + } + return m + + +def extract_features(pkg: FlPackage, /) -> pl.DataFrame: + # NOTE: `is_name_collision` != `GitHub.trees`/`Metadata.name_collision` + # - This only considers latest version + # - Those others are based on whatever tag the tree refers to + # https://github.com/vega/vega-datasets/issues/633 + EXCLUDE = ( + "name", + "type", + "format", + "scheme", + "mediatype", + "encoding", + "dialect", + "schema", + ) + return ( + pl.LazyFrame(pkg["resources"]) + .with_columns( + path_stem("path").alias(DATASET_NAME), + cs.exclude("name"), + col("name").is_duplicated().alias("is_name_collision"), + ) + .select( + DATASET_NAME, + path_suffix("path").alias("suffix"), + ~cs.by_name(DATASET_NAME, EXCLUDE), + *FEATURES, + col("schema").is_not_null().alias("has_schema"), + ) + .collect() + ) + + +def path_stem(column: str | pl.Expr, /) -> pl.Expr: + """ + The final path component, minus its last suffix. + + Needed since `Resource.name`_ must be lowercase. + + .. _Resource.name: + https://specs.frictionlessdata.io/data-resource/#name + """ + path = col(column) if isinstance(column, str) else column + rfind = (path.str.len_bytes() - 1) - path.str.reverse().str.find(r"\.") + return path.str.head(rfind) + + +def path_suffix(column: str | pl.Expr, /) -> pl.Expr: + """ + The final component's last suffix. + + This includes the leading period. For example: '.txt'. 
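
For a sense of what ``path_stem``/``path_suffix`` aim to return, an alternative regex-based formulation with ``str.extract`` that should agree with them on the plain filenames used in ``datapackage.json``; it is not the implementation used here:

    import polars as pl

    df = pl.DataFrame({"path": ["airports.csv", "flights-3m.parquet", "7zip.png"]})
    out = df.select(
        dataset_name=pl.col("path").str.extract(r"(.+)\.[^.]+$", 1),
        suffix=pl.col("path").str.extract(r"(\.[^.]+)$", 1),
    )
    # dataset_name: "airports", "flights-3m", "7zip"
    # suffix:       ".csv", ".parquet", ".png"
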
+ """ + path = col(column) if isinstance(column, str) else column + return path.str.tail(path.str.reverse().str.find(r"\.") + 1) + + +def features_typing(frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + guards = deque[str]() + ldf = frame.lazy() + for feat in FEATURES: + guard_name = feat.meta.output_name() + alias_name = guard_name.removeprefix("is_").capitalize() + members = ldf.filter(guard_name).select(DATASET_NAME).collect().to_series() + guards.append(guard_literal(alias_name, guard_name, members)) + yield f"{alias_name}: TypeAlias = {utils.spell_literal(members)}" + yield from guards + + +def guard_literal(alias_name: str, guard_name: str, members: Iterable[str], /) -> str: + """Type narrowing function, all members must be literal strings.""" + return ( + f"def {guard_name}(obj: Any) -> TypeIs[{alias_name}]:\n" + f" return obj in set({sorted(set(members))!r})\n" + ) diff --git a/tools/datasets/models.py b/tools/datasets/models.py index 449c412ef..a454ed30c 100644 --- a/tools/datasets/models.py +++ b/tools/datasets/models.py @@ -3,7 +3,8 @@ from __future__ import annotations import sys -from typing import TYPE_CHECKING, Literal, NamedTuple +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Any, Literal, NamedTuple if sys.version_info >= (3, 14): from typing import TypedDict @@ -14,9 +15,18 @@ import time if sys.version_info >= (3, 11): - from typing import LiteralString, Required + from typing import LiteralString, NotRequired, Required else: - from typing_extensions import LiteralString, Required + from typing_extensions import LiteralString, NotRequired, Required + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + import polars as pl + + from altair.datasets._typing import Dataset, FlFieldStr + +Map: TypeAlias = Mapping[str, Any] class GitHubUrl(NamedTuple): @@ -31,6 +41,7 @@ class GitHubUrl(NamedTuple): class NpmUrl(NamedTuple): CDN: LiteralString TAGS: LiteralString + GH: LiteralString class GitHubTag(TypedDict): @@ -178,3 +189,80 @@ class GitHubRateLimitResources(TypedDict, total=False): graphql: GitHubRateLimit integration_manifest: GitHubRateLimit code_search: GitHubRateLimit + + +##################################################### +# frictionless datapackage +##################################################### + + +FlCsvDialect: TypeAlias = Mapping[ + Literal["csv"], Mapping[Literal["delimiter"], Literal["\t"]] +] +FlJsonDialect: TypeAlias = Mapping[ + Literal[r"json"], Mapping[Literal["keyed"], Literal[True]] +] + + +class FlField(TypedDict): + """https://datapackage.org/standard/table-schema/#field.""" + + name: str + type: FlFieldStr + + +class FlSchema(TypedDict): + """https://datapackage.org/standard/table-schema/#properties.""" + + fields: Sequence[FlField] + + +class FlResource(TypedDict): + """https://datapackage.org/standard/data-resource/#properties.""" + + name: Dataset + type: Literal["table", "file", r"json"] + path: str + format: Literal[ + "arrow", "csv", "geojson", r"json", "parquet", "png", "topojson", "tsv" + ] + mediatype: Literal[ + "application/parquet", + "application/vnd.apache.arrow.file", + "image/png", + "text/csv", + "text/tsv", + r"text/json", + "text/geojson", + "text/topojson", + ] + schema: NotRequired[FlSchema] + scheme: Literal["file"] + dialect: NotRequired[FlCsvDialect | FlJsonDialect] + encoding: NotRequired[Literal["utf-8"]] + + +class FlPackage(TypedDict): + """ + A subset of the `Data Package`_ standard. + + .. 
_Data Package: + https://datapackage.org/standard/data-package/#properties + """ + + name: Literal["vega-datasets"] + version: str + homepage: str + description: str + licenses: Sequence[Map] + contributors: Sequence[Map] + sources: Sequence[Map] + created: str + resources: Sequence[FlResource] + + +class ParsedPackage(TypedDict): + """Minimal representations to write to disk.""" + + features: pl.DataFrame + schemas: Mapping[Dataset, Mapping[str, FlFieldStr]] diff --git a/tools/datasets/npm.py b/tools/datasets/npm.py index a5f068082..f71037d5c 100644 --- a/tools/datasets/npm.py +++ b/tools/datasets/npm.py @@ -2,23 +2,28 @@ import json import urllib.request -from typing import TYPE_CHECKING, ClassVar, Literal +from pathlib import Path +from typing import TYPE_CHECKING, Any, ClassVar, Literal import polars as pl -from tools.datasets import semver +from tools.datasets import datapackage, semver from tools.datasets.models import NpmUrl if TYPE_CHECKING: import sys - from pathlib import Path from urllib.request import OpenerDirector if sys.version_info >= (3, 11): from typing import LiteralString else: from typing_extensions import LiteralString - from tools.datasets.models import NpmPackageMetadataResponse + from altair.datasets._typing import Version + from tools.datasets.models import ( + FlPackage, + NpmPackageMetadataResponse, + ParsedPackage, + ) __all__ = ["Npm"] @@ -46,6 +51,7 @@ def __init__( self._url: NpmUrl = NpmUrl( CDN=f"https://cdn.{jsdelivr}.net/{npm}/{package}@", TAGS=f"https://data.{jsdelivr}.com/{jsdelivr_version}/packages/{npm}/{package}", + GH=f"https://cdn.{jsdelivr}.net/gh/vega/{package}@", ) @property @@ -78,3 +84,43 @@ def tags(self) -> pl.DataFrame: if (tag := v["version"]) and semver.CANARY not in tag ] return pl.DataFrame({"tag": versions}).pipe(semver.with_columns) + + def file_gh( + self, + branch_or_tag: Literal["main"] | Version | LiteralString, + path: str, + /, + ) -> Any: + """ + Request a file from the `jsdelivr GitHub`_ endpoint. + + Parameters + ---------- + branch_or_tag + Version of the file, see `branches`_ and `tags`_. + path + Relative filepath from the root of the repo. + + .. _jsdelivr GitHub: + https://www.jsdelivr.com/documentation#id-github + .. _branches: + https://github.com/vega/vega-datasets/branches + .. 
_tags: + https://github.com/vega/vega-datasets/tags + """ + path = path.lstrip("./") + suffix = Path(path).suffix + if suffix == ".json": + headers = {"Accept": "application/json"} + read_fn = json.load + else: + raise NotImplementedError(path, suffix) + req = urllib.request.Request( + f"{self.url.GH}{branch_or_tag}/{path}", headers=headers + ) + with self._opener.open(req) as response: + return read_fn(response) + + def datapackage(self, *, tag: LiteralString | None = None) -> ParsedPackage: + pkg: FlPackage = self.file_gh(tag or "main", "datapackage.json") + return datapackage.parse_package(pkg) From 9274284a16962c55df1faff2db20ec1e0d55313f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 16:08:48 +0000 Subject: [PATCH 133/137] refactor(ruff): Apply `TC006` fixes in new code Related https://github.com/vega/altair/pull/3706 --- tests/test_datasets.py | 2 +- tools/datasets/datapackage.py | 2 +- tools/datasets/github.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 9d91c275e..f9dd4c5a3 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -493,7 +493,7 @@ def test_reader_cache( if nw_dep.is_polars_dataframe(lookup_groups): left, right = ( lookup_groups, - cast(pl.DataFrame, data("lookup_groups", tag="v2.5.3")), + cast("pl.DataFrame", data("lookup_groups", tag="v2.5.3")), ) else: left, right = ( diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index da1f8375e..deb63fbb9 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -9,7 +9,7 @@ from collections import deque from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, get_args +from typing import TYPE_CHECKING, Any, Literal import polars as pl from polars import col diff --git a/tools/datasets/github.py b/tools/datasets/github.py index b9b156c60..406eca3dc 100644 --- a/tools/datasets/github.py +++ b/tools/datasets/github.py @@ -487,4 +487,4 @@ def _iter_rows(df: pl.DataFrame, stop: int | None, /, tp: type[_TD]) -> Iterator if not TYPE_CHECKING: assert is_typeddict(tp) or issubclass(tp, Mapping) - return cast(Iterator[_TD], islice(df.iter_rows(named=True), stop)) + return cast("Iterator[_TD]", islice(df.iter_rows(named=True), stop)) From 8e232b8d38d39c2832e64f5b959482585c4cc4e3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 17:01:45 +0000 Subject: [PATCH 134/137] docs(DRAFT): Add notes on `datapackage.features_typing` --- tools/datasets/datapackage.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/datasets/datapackage.py b/tools/datasets/datapackage.py index deb63fbb9..445974795 100644 --- a/tools/datasets/datapackage.py +++ b/tools/datasets/datapackage.py @@ -114,6 +114,15 @@ def path_suffix(column: str | pl.Expr, /) -> pl.Expr: def features_typing(frame: pl.LazyFrame | pl.DataFrame, /) -> Iterator[str]: + """ + Current plan is to use type aliases in overloads. 
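
Concretely, the generated output for one flag column would look roughly like the following; this is a hand-written approximation of what ``features_typing``/``guard_literal`` emit, using the image dataset names from the upstream package:

    from typing import Any, Literal

    from typing_extensions import TypeAlias, TypeIs

    Image: TypeAlias = Literal["7zip", "ffox", "gimp"]

    def is_image(obj: Any) -> TypeIs[Image]:
        return obj in {"7zip", "ffox", "gimp"}
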
+ + - ``Tabular`` can be treated interchangeably + - ``Image`` can only work with ``url`` + - ``(Spatial|Geo|Topo)`` can be read with ``polars`` + - A future version may implement dedicated support https://github.com/vega/altair/pull/3631#discussion_r1845931955 + - ``Json`` should warn when using the ``pyarrow`` backend + """ guards = deque[str]() ldf = frame.lazy() for feat in FEATURES: From 93308958fbf40873fc4023d6b20e1e81bc97d5ab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:16:04 +0000 Subject: [PATCH 135/137] docs: Update `Loader.from_backend` example w/ dtypes Related https://github.com/vega/altair/pull/3631/commits/909e7d05e57718b2f634a7e6781cb4e58a835837 --- altair/datasets/_loader.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index ce2559aed..f9190f789 100644 --- a/altair/datasets/_loader.py +++ b/altair/datasets/_loader.py @@ -117,15 +117,15 @@ def from_backend(cls, backend_name: _Backend, /) -> Loader[Any, Any]: pandas.core.frame.DataFrame >>> cars.dtypes # doctest: +SKIP - Name string[pyarrow] - Miles_per_Gallon double[pyarrow] - Cylinders int64[pyarrow] - Displacement double[pyarrow] - Horsepower int64[pyarrow] - Weight_in_lbs int64[pyarrow] - Acceleration double[pyarrow] - Year string[pyarrow] - Origin string[pyarrow] + Name string[pyarrow] + Miles_per_Gallon double[pyarrow] + Cylinders int64[pyarrow] + Displacement double[pyarrow] + Horsepower int64[pyarrow] + Weight_in_lbs int64[pyarrow] + Acceleration double[pyarrow] + Year timestamp[ns][pyarrow] + Origin string[pyarrow] dtype: object """ obj = Loader.__new__(Loader) From caf534da20f9b96187283d67a458f17c0b0346bb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:27:33 +0000 Subject: [PATCH 136/137] feat: Use `_pl_read_json_roundtrip` instead of `pl.read_json` for `pyarrow` Provides better dtype inference --- altair/datasets/_readers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/altair/datasets/_readers.py b/altair/datasets/_readers.py index 5b9829b9e..e2607acbc 100644 --- a/altair/datasets/_readers.py +++ b/altair/datasets/_readers.py @@ -380,10 +380,9 @@ def __init__(self, name: _PyArrow, /) -> None: # ------------------------------------------------------- # NOTE: Prefer `polars` since it is zero-copy and fast (1) if find_spec("polars") is not None: - import polars as pl def pa_read_json(source: StrPath, /, **kwds) -> pa.Table: - return pl.read_json(source).to_arrow() + return _pl_read_json_roundtrip(source).to_arrow() else: # NOTE: Convert inline from stdlib json (2) From 75bf2bad9d5d8f59c6084f1f58686085409f604c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 2 Dec 2024 18:57:03 +0000 Subject: [PATCH 137/137] docs: Replace example dataset Switching to one with a timestamp that `frictionless` recognises https://github.com/vega/vega-datasets/blob/8745f5c61ba951fe057a42562b8b88604b4a3735/datapackage.json#L2674-L2689 https://github.com/vega/vega-datasets/blob/8745f5c61ba951fe057a42562b8b88604b4a3735/datapackage.json#L45-L57 --- altair/datasets/_loader.py | 88 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/altair/datasets/_loader.py b/altair/datasets/_loader.py index f9190f789..2b8a2cd95 100644 --- a/altair/datasets/_loader.py +++ 
b/altair/datasets/_loader.py @@ -171,72 +171,72 @@ def __call__( from altair.datasets import Loader data = Loader.from_backend("polars") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.columns # doctest: +SKIP - ['symbol', 'date', 'price'] + ['year', 'source', 'net_generation'] >>> source # doctest: +SKIP - shape: (560, 3) - ┌────────┬────────────┬────────┐ - │ symbol ┆ date ┆ price │ - │ --- ┆ --- ┆ --- │ - │ str ┆ str ┆ f64 │ - ╞════════╪════════════╪════════╡ - │ MSFT ┆ Jan 1 2000 ┆ 39.81 │ - │ MSFT ┆ Feb 1 2000 ┆ 36.35 │ - │ MSFT ┆ Mar 1 2000 ┆ 43.22 │ - │ MSFT ┆ Apr 1 2000 ┆ 28.37 │ - │ MSFT ┆ May 1 2000 ┆ 25.45 │ - │ … ┆ … ┆ … │ - │ AAPL ┆ Nov 1 2009 ┆ 199.91 │ - │ AAPL ┆ Dec 1 2009 ┆ 210.73 │ - │ AAPL ┆ Jan 1 2010 ┆ 192.06 │ - │ AAPL ┆ Feb 1 2010 ┆ 204.62 │ - │ AAPL ┆ Mar 1 2010 ┆ 223.02 │ - └────────┴────────────┴────────┘ + shape: (51, 3) + ┌────────────┬──────────────┬────────────────┐ + │ year ┆ source ┆ net_generation │ + │ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ i64 │ + ╞════════════╪══════════════╪════════════════╡ + │ 2001-01-01 ┆ Fossil Fuels ┆ 35361 │ + │ 2002-01-01 ┆ Fossil Fuels ┆ 35991 │ + │ 2003-01-01 ┆ Fossil Fuels ┆ 36234 │ + │ 2004-01-01 ┆ Fossil Fuels ┆ 36205 │ + │ 2005-01-01 ┆ Fossil Fuels ┆ 36883 │ + │ … ┆ … ┆ … │ + │ 2013-01-01 ┆ Renewables ┆ 16476 │ + │ 2014-01-01 ┆ Renewables ┆ 17452 │ + │ 2015-01-01 ┆ Renewables ┆ 19091 │ + │ 2016-01-01 ┆ Renewables ┆ 21241 │ + │ 2017-01-01 ┆ Renewables ┆ 21933 │ + └────────────┴──────────────┴────────────────┘ Using ``pandas``: data = Loader.from_backend("pandas") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.columns # doctest: +SKIP - Index(['symbol', 'date', 'price'], dtype='object') + Index(['year', 'source', 'net_generation'], dtype='object') >>> source # doctest: +SKIP - symbol date price - 0 MSFT Jan 1 2000 39.81 - 1 MSFT Feb 1 2000 36.35 - 2 MSFT Mar 1 2000 43.22 - 3 MSFT Apr 1 2000 28.37 - 4 MSFT May 1 2000 25.45 - .. ... ... ... - 555 AAPL Nov 1 2009 199.91 - 556 AAPL Dec 1 2009 210.73 - 557 AAPL Jan 1 2010 192.06 - 558 AAPL Feb 1 2010 204.62 - 559 AAPL Mar 1 2010 223.02 - - [560 rows x 3 columns] + year source net_generation + 0 2001-01-01 Fossil Fuels 35361 + 1 2002-01-01 Fossil Fuels 35991 + 2 2003-01-01 Fossil Fuels 36234 + 3 2004-01-01 Fossil Fuels 36205 + 4 2005-01-01 Fossil Fuels 36883 + .. ... ... ... 
+ 46 2013-01-01 Renewables 16476 + 47 2014-01-01 Renewables 17452 + 48 2015-01-01 Renewables 19091 + 49 2016-01-01 Renewables 21241 + 50 2017-01-01 Renewables 21933 + + [51 rows x 3 columns] Using ``pyarrow``: data = Loader.from_backend("pyarrow") - source = data("stocks", tag="v2.10.0") + source = data("iowa-electricity", tag="v2.10.0") >>> source.column_names # doctest: +SKIP - ['symbol', 'date', 'price'] + ['year', 'source', 'net_generation'] >>> source # doctest: +SKIP pyarrow.Table - symbol: string - date: string - price: double + year: date32[day] + source: string + net_generation: int64 ---- - symbol: [["MSFT","MSFT","MSFT","MSFT","MSFT",...,"AAPL","AAPL","AAPL","AAPL","AAPL"]] - date: [["Jan 1 2000","Feb 1 2000","Mar 1 2000","Apr 1 2000","May 1 2000",...,"Nov 1 2009","Dec 1 2009","Jan 1 2010","Feb 1 2010","Mar 1 2010"]] - price: [[39.81,36.35,43.22,28.37,25.45,...,199.91,210.73,192.06,204.62,223.02]] + year: [[2001-01-01,2002-01-01,2003-01-01,2004-01-01,2005-01-01,...,2013-01-01,2014-01-01,2015-01-01,2016-01-01,2017-01-01]] + source: [["Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels","Fossil Fuels",...,"Renewables","Renewables","Renewables","Renewables","Renewables"]] + net_generation: [[35361,35991,36234,36205,36883,...,16476,17452,19091,21241,21933]] """ return self._reader.dataset(name, suffix, tag=tag, **kwds)
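
For reference, the request that ``Npm.datapackage`` ends up issuing can be reproduced with the stdlib alone; the URL below follows the ``NpmUrl.GH`` template with the ``main`` branch:

    import json
    import urllib.request

    url = "https://cdn.jsdelivr.net/gh/vega/vega-datasets@main/datapackage.json"
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(req) as response:
        pkg = json.load(response)
    # Each resource describes one dataset: path, format, and (for tabular files) a schema.
    print(pkg["name"], len(pkg["resources"]))
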
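Taken together, the schema cache means the date handling shown above should come for free with the pandas backends; a short usage check mirroring the docstring example:

    from altair.datasets import Loader

    data = Loader.from_backend("pandas")
    source = data("iowa-electricity", tag="v2.10.0")
    # "year" should arrive as a datetime64 column rather than plain strings,
    # because datapackage_schemas.json.gz marks it as a date field.
    print(source.dtypes)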