diff --git a/conftest.py b/conftest.py
index b558abfd..32b3581f 100644
--- a/conftest.py
+++ b/conftest.py
@@ -33,6 +33,19 @@ def netcdf4_file(tmpdir):
     return filepath
 
 
+@pytest.fixture
+def hdf5_groups_file(tmpdir):
+    # Set up example xarray dataset
+    ds = xr.tutorial.open_dataset("air_temperature")
+
+    # Save it to disk as netCDF (in temporary directory)
+    filepath = f"{tmpdir}/air.nc"
+    ds.to_netcdf(filepath, format="NETCDF4", group="test/group")
+    ds.close()
+
+    return filepath
+
+
 @pytest.fixture
 def netcdf4_files(tmpdir):
     # Set up example xarray dataset
diff --git a/docs/releases.rst b/docs/releases.rst
index 39d517be..216b3b80 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -8,6 +8,8 @@ v1.0.1 (unreleased)
 New Features
 ~~~~~~~~~~~~
 
+- New ``group`` option on ``open_virtual_dataset`` enables extracting specific HDF Groups.
+  (:pull:`165`) By `Scott Henderson <https://github.com/scottyhq>`_.
 - Adds `decode_times` to open_virtual_dataset (:pull:`232`)
   By `Raphael Hagen <https://github.com/norlandrhagen>`_.
 
diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index b155c2ce..73fabe74 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -51,6 +51,7 @@ def open_virtual_dataset(
     filepath: str,
     *,
     filetype: FileType | None = None,
+    group: str | None = None,
     drop_variables: Iterable[str] | None = None,
     loadable_variables: Iterable[str] | None = None,
     decode_times: bool | None = None,
@@ -74,6 +75,8 @@ def open_virtual_dataset(
         Type of file to be opened. Used to determine which kerchunk file format backend to use.
         Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
         If not provided will attempt to automatically infer the correct filetype from header bytes.
+    group : str, default is None
+        Path to the HDF5/netCDF4 group in the given file to open, given as a str. Only supported for filetypes 'netcdf4' and 'hdf5'.
     drop_variables: list[str], default is None
         Variables in the file to drop before returning.
     loadable_variables: list[str], default is None
@@ -171,6 +174,7 @@ def open_virtual_dataset(
         vds_refs = read_kerchunk_references_from_file(
             filepath=filepath,
             filetype=filetype,
+            group=group,
             reader_options=reader_options,
         )
         virtual_vars = virtual_vars_from_kerchunk_refs(
@@ -195,6 +199,7 @@ def open_virtual_dataset(
         ds = xr.open_dataset(
             cast(XArrayOpenT, fpath),
             drop_variables=drop_variables,
+            group=group,
             decode_times=decode_times,
         )
 
diff --git a/virtualizarr/readers/kerchunk.py b/virtualizarr/readers/kerchunk.py
index 4686ce94..19a8c28d 100644
--- a/virtualizarr/readers/kerchunk.py
+++ b/virtualizarr/readers/kerchunk.py
@@ -57,6 +57,7 @@ def _automatically_determine_filetype(
 def read_kerchunk_references_from_file(
     filepath: str,
     filetype: FileType | None,
     group: str | None,
     reader_options: Optional[dict[str, Any]] = None,
 ) -> KerchunkStoreRefs:
     """
@@ -69,6 +70,8 @@ def read_kerchunk_references_from_file(
     filetype : FileType, default: None
         Type of file to be opened. Used to determine which kerchunk file format backend to use.
         If not provided will attempt to automatically infer the correct filetype from the the filepath's extension.
+    group : str, default is None
+        Path to the HDF5/netCDF4 group in the given file to open, given as a str. Only supported for filetypes 'netcdf4' and 'hdf5'.
     reader_options: dict, default {}
         Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments, so ensure reader_options match selected Kerchunk reader arguments.
""" @@ -96,6 +98,9 @@ def read_kerchunk_references_from_file( refs = SingleHdf5ToZarr( filepath, inline_threshold=0, **reader_options ).translate() + + refs = extract_group(refs, group) + elif filetype.name.lower() == "grib": # TODO Grib files should be handled as a DataTree object # see https://github.com/TomNicholas/VirtualiZarr/issues/11 @@ -123,6 +128,44 @@ def read_kerchunk_references_from_file( return refs +def extract_group(vds_refs: KerchunkStoreRefs, group: str | None) -> KerchunkStoreRefs: + """Extract only the part of the kerchunk reference dict that is relevant to a single HDF group""" + hdf_groups = [ + k.removesuffix(".zgroup") for k in vds_refs["refs"].keys() if ".zgroup" in k + ] + if len(hdf_groups) == 1: + return vds_refs + else: + if group is None: + raise ValueError( + f"Multiple HDF Groups found. Must specify group= keyword to select one of {hdf_groups}" + ) + else: + # Ensure supplied group kwarg is consistent with kerchunk keys + if not group.endswith("/"): + group += "/" + if group.startswith("/"): + group = group.removeprefix("/") + + if group not in hdf_groups: + raise ValueError(f'Group "{group}" not found in {hdf_groups}') + + # Filter by group prefix and remove prefix from all keys + groupdict = { + k.removeprefix(group): v + for k, v in vds_refs["refs"].items() + if k.startswith(group) + } + # Also remove group prefix from _ARRAY_DIMENSIONS + for k, v in groupdict.items(): + if isinstance(v, str): + groupdict[k] = v.replace("\\/", "/").replace(group, "") + + vds_refs["refs"] = groupdict + + return KerchunkStoreRefs(vds_refs) + + def virtual_vars_from_kerchunk_refs( refs: KerchunkStoreRefs, drop_variables: list[str] | None = None, @@ -214,6 +257,7 @@ def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: refs = ds_reference_dict["refs"] found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key} + return list(found_var_names) @@ -235,6 +279,7 @@ def extract_array_refs( } return fully_decode_arr_refs(arr_refs) + else: raise KeyError( f"Could not find zarr array variable name {var_name}, only {found_var_names}" diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index c3001587..e42ad9ac 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -1,6 +1,7 @@ from collections.abc import Mapping from unittest.mock import patch +import fsspec import numpy as np import pytest import xarray as xr @@ -192,6 +193,10 @@ class TestReadFromURL: "hdf4", "https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf", ), + ( + "hdf5", + "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5", + ), # https://github.com/zarr-developers/VirtualiZarr/issues/159 # ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"), pytest.param( @@ -218,10 +223,48 @@ def test_read_from_url(self, filetype, url): if filetype in ["grib", "jpg", "hdf4"]: with pytest.raises(NotImplementedError): vds = open_virtual_dataset(url, reader_options={}, indexes={}) + elif filetype == "hdf5": + vds = open_virtual_dataset( + url, + group="science/LSAR/GCOV/grids/frequencyA", + drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + indexes={}, + reader_options={}, + ) + assert isinstance(vds, xr.Dataset) else: vds = open_virtual_dataset(url, indexes={}) assert isinstance(vds, 
             assert isinstance(vds, xr.Dataset)
+
+    def test_virtualizarr_vs_local_nisar(self):
+        # Open group directly from locally cached file with xarray
+        url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5"
+        tmpfile = fsspec.open_local(
+            f"filecache::{url}", filecache=dict(cache_storage="/tmp", same_names=True)
+        )
+        hdf_group = "science/LSAR/GCOV/grids/frequencyA"
+        dsXR = xr.open_dataset(
+            tmpfile,
+            engine="h5netcdf",
+            group=hdf_group,
+            drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
+            phony_dims="access",
+        )
+
+        # Save a group reference file via virtualizarr, then open with engine="kerchunk"
+        vds = open_virtual_dataset(
+            tmpfile,
+            group=hdf_group,
+            indexes={},
+            drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
+        )
+        tmpref = "/tmp/nisar.json"
+        vds.virtualize.to_kerchunk(tmpref, format="json")
+        dsV = xr.open_dataset(tmpref, engine="kerchunk")
+
+        # xrt.assert_identical(dsXR, dsV)  # attribute order changes
+        xrt.assert_equal(dsXR, dsV)
+
 
 class TestLoadVirtualDataset:
     def test_loadable_variables(self, netcdf4_file):
@@ -249,6 +292,27 @@ def test_explicit_filetype(self, netcdf4_file):
         with pytest.raises(NotImplementedError):
             open_virtual_dataset(netcdf4_file, filetype="grib")
 
+    def test_group_kwarg(self, hdf5_groups_file):
+        with pytest.raises(ValueError, match="Multiple HDF Groups found"):
+            open_virtual_dataset(hdf5_groups_file)
+        with pytest.raises(ValueError, match="not found in"):
+            open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
+
+        vars_to_load = ["air", "time"]
+        vds = open_virtual_dataset(
+            hdf5_groups_file,
+            group="test/group",
+            loadable_variables=vars_to_load,
+            indexes={},
+        )
+        full_ds = xr.open_dataset(
+            hdf5_groups_file,
+            group="test/group",
+        )
+        for name in full_ds.variables:
+            if name in vars_to_load:
+                xrt.assert_identical(vds.variables[name], full_ds.variables[name])
+
     @patch("virtualizarr.readers.kerchunk.read_kerchunk_references_from_file")
     def test_open_virtual_dataset_passes_expected_args(
         self, mock_read_kerchunk, netcdf4_file
@@ -258,6 +322,7 @@ def test_open_virtual_dataset_passes_expected_args(
         args = {
             "filepath": netcdf4_file,
             "filetype": None,
+            "group": None,
             "reader_options": reader_options,
         }
         mock_read_kerchunk.assert_called_once_with(**args)
diff --git a/virtualizarr/types/kerchunk.py b/virtualizarr/types/kerchunk.py
index e8dada20..d124cca3 100644
--- a/virtualizarr/types/kerchunk.py
+++ b/virtualizarr/types/kerchunk.py
@@ -4,9 +4,10 @@
 # (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html)
 # TODO I would prefer to be more specific about these types
 KerchunkStoreRefs = NewType(
-    "KerchunkStoreRefs", dict
-)  # top-level dict with keys for 'version', 'refs'
+    "KerchunkStoreRefs",
+    dict,  # dict_keys(['version', 'refs'])
+)  # top-level dict containing the kerchunk version and the 'refs' dictionary, assumed to hold a single '.zgroup' key plus multiple KerchunkArrRefs
 KerchunkArrRefs = NewType(
     "KerchunkArrRefs",
-    dict,
-)  # lower-level dict containing just the information for one zarr array
+    dict,  # dict_keys(['.zarray', '.zattrs', '0.0', '0.1', ...])
+)  # lower-level dict defining a single Zarr Array, with keys for '.zarray', '.zattrs', and every chunk
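
For reviewers: a minimal usage sketch of the new ``group`` keyword, assuming a local NETCDF4 file whose variables live under a nested "test/group" group (mirroring the ``hdf5_groups_file`` fixture added above)::

    import xarray as xr

    from virtualizarr import open_virtual_dataset

    # Write an example dataset into a nested HDF group,
    # exactly as the hdf5_groups_file fixture does
    ds = xr.tutorial.open_dataset("air_temperature")
    ds.to_netcdf("air.nc", format="NETCDF4", group="test/group")
    ds.close()

    # Opening without group= raises ValueError ("Multiple HDF Groups found...");
    # passing group= selects which subtree to virtualize
    vds = open_virtual_dataset("air.nc", group="test/group", indexes={})
    print(vds)  # virtual dataset containing only the variables under test/group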
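
And a sketch of the key filtering that the new ``extract_group`` helper performs, applied to a hypothetical two-group kerchunk reference dict (the key names and chunk entries below are invented for illustration)::

    from virtualizarr.readers.kerchunk import extract_group

    # Hypothetical refs for a file with a root group plus one nested group
    refs = {
        "version": 1,
        "refs": {
            ".zgroup": '{"zarr_format": 2}',
            "test/group/.zgroup": '{"zarr_format": 2}',
            "test/group/air/.zarray": '{"shape": [2], "chunks": [2]}',
            "test/group/air/.zattrs": '{"_ARRAY_DIMENSIONS": ["time"]}',
            "test/group/air/0": ["air.nc", 8192, 4096],
        },
    }

    # Leading/trailing slashes on group= are normalized, then every key is
    # re-rooted at the selected group by stripping the "test/group/" prefix
    subset = extract_group(refs, "test/group")
    assert sorted(subset["refs"]) == [".zgroup", "air/.zarray", "air/.zattrs", "air/0"]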