Support specifying single HDF Group in open_virtual_dataset #165

Merged · 14 commits · Aug 27, 2024
13 changes: 13 additions & 0 deletions conftest.py
@@ -32,6 +32,19 @@ def netcdf4_file(tmpdir):
return filepath


@pytest.fixture
def hdf5_groups_file(tmpdir):
# Set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature")

# Save it to disk as netCDF (in temporary directory)
filepath = f"{tmpdir}/air.nc"
ds.to_netcdf(filepath, format="NETCDF4", group="test/group")
ds.close()

return filepath


@pytest.fixture
def netcdf4_files(tmpdir):
# Set up example xarray dataset
2 changes: 2 additions & 0 deletions docs/releases.rst
@@ -8,6 +8,8 @@ v1.0.1 (unreleased)

New Features
~~~~~~~~~~~~
- New ``group`` option on ``open_virtual_dataset`` enables extracting a specific HDF group.
(:pull:`165`) By `Scott Henderson <https://github.com/scottyhq>`_.
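
A minimal usage sketch of the new option (the ``air.nc`` filename and ``test/group`` path are illustrative, borrowed from the test fixture above):

from virtualizarr import open_virtual_dataset

# only variables stored under the given HDF5/netCDF4 group end up in the virtual dataset
vds = open_virtual_dataset("air.nc", group="test/group", indexes={})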

Breaking changes
~~~~~~~~~~~~~~~~
60 changes: 52 additions & 8 deletions virtualizarr/kerchunk.py
@@ -18,12 +18,13 @@
# (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html)
# TODO I would prefer to be more specific about these types
KerchunkStoreRefs = NewType(
"KerchunkStoreRefs", dict
) # top-level dict with keys for 'version', 'refs'
"KerchunkStoreRefs",
dict, # dict_keys(['version', 'refs'])
) # top-level dict containing the kerchunk version and a 'refs' dict, assumed to hold a single '.zgroup' key plus multiple KerchunkArrRefs
KerchunkArrRefs = NewType(
"KerchunkArrRefs",
dict,
) # lower-level dict containing just the information for one zarr array
dict, # dict_keys(['.zarray', '.zattrs', '0.0', '0.1', ...])
) # lower-level dict defining a single Zarr Array, with keys for '.zarray', '.zattrs', and every chunk
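
For orientation, a hand-written sketch of these two levels (kerchunk v1 layout; the paths, offsets, and lengths are made up):

# KerchunkStoreRefs: the whole reference store
store_refs = {
    "version": 1,
    "refs": {
        ".zgroup": '{"zarr_format": 2}',
        "air/.zarray": '{"chunks": [25, 53], ...}',
        "air/.zattrs": '{"_ARRAY_DIMENSIONS": ["lat", "lon"]}',
        "air/0.0": ["air.nc", 15032, 7738],  # [file path, byte offset, length]
    },
}

# KerchunkArrRefs: one array's slice of "refs", with the variable-name prefix stripped
arr_refs = {
    ".zarray": '{"chunks": [25, 53], ...}',
    ".zattrs": '{"_ARRAY_DIMENSIONS": ["lat", "lon"]}',
    "0.0": ["air.nc", 15032, 7738],
}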


class AutoName(Enum):
@@ -59,6 +60,7 @@ def default(self, obj):
def read_kerchunk_references_from_file(
filepath: str,
filetype: FileType | None,
group: str | None,
reader_options: Optional[dict[str, Any]] = None,
) -> KerchunkStoreRefs:
"""
@@ -71,22 +73,21 @@ def read_kerchunk_references_from_file(
filetype : FileType, default: None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
If not provided, will attempt to automatically infer the correct filetype from the filepath's extension.
group : str, default: None
Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes "netcdf4" and "hdf5".
reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
so ensure reader_options match selected Kerchunk reader arguments.
"""

if filetype is None:
filetype = _automatically_determine_filetype(
filepath=filepath, reader_options=reader_options
)
filetype = FileType(filetype)

if reader_options is None:
reader_options = {}

# if filetype is user defined, convert to FileType
filetype = FileType(filetype)

if filetype.name.lower() == "netcdf3":
from kerchunk.netCDF3 import NetCDF3ToZarr

@@ -98,6 +99,9 @@ def read_kerchunk_references_from_file(
refs = SingleHdf5ToZarr(
filepath, inline_threshold=0, **reader_options
).translate()

refs = extract_group(refs, group)

elif filetype.name.lower() == "grib":
# TODO Grib files should be handled as a DataTree object
# see https://github.com/TomNicholas/VirtualiZarr/issues/11
@@ -125,6 +129,44 @@ def read_kerchunk_references_from_file(
return refs


def extract_group(vds_refs: KerchunkStoreRefs, group: str | None) -> KerchunkStoreRefs:
"""Extract only the part of the kerchunk reference dict that is relevant to a single HDF group"""
hdf_groups = [
k.removesuffix(".zgroup") for k in vds_refs["refs"].keys() if ".zgroup" in k
]
if len(hdf_groups) == 1:
return vds_refs
else:
if group is None:
raise ValueError(
f"Multiple HDF Groups found. Must specify group= keyword to select one of {hdf_groups}"
)
else:
# Ensure supplied group kwarg is consistent with kerchunk keys
if not group.endswith("/"):
group += "/"
if group.startswith("/"):
group = group.removeprefix("/")

if group not in hdf_groups:
raise ValueError(f'Group "{group}" not found in {hdf_groups}')

# Filter by group prefix and remove prefix from all keys
groupdict = {
k.removeprefix(group): v
for k, v in vds_refs["refs"].items()
if k.startswith(group)
}
# Also remove group prefix from _ARRAY_DIMENSIONS
for k, v in groupdict.items():
if isinstance(v, str):
groupdict[k] = v.replace("\\/", "/").replace(group, "")

vds_refs["refs"] = groupdict

return KerchunkStoreRefs(vds_refs)
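
A small worked example of the filtering above (group names and byte ranges invented for illustration):

refs = {
    "version": 1,
    "refs": {
        ".zgroup": '{"zarr_format": 2}',
        "test/.zgroup": '{"zarr_format": 2}',
        "test/group/.zgroup": '{"zarr_format": 2}',
        "test/group/air/.zarray": '{"chunks": [25, 53], ...}',
        "test/group/air/0.0": ["air.nc", 8192, 4096],
    },
}
# Three '.zgroup' keys are found, so group= must be given. extract_group(refs, "test/group")
# normalizes the group to "test/group/", keeps only keys under that prefix, and strips it,
# leaving refs rooted at the group: {".zgroup": ..., "air/.zarray": ..., "air/0.0": [...]}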


def _automatically_determine_filetype(
*,
filepath: str,
@@ -166,6 +208,7 @@ def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]:

refs = ds_reference_dict["refs"]
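# variable names are the first path component of keys like "air/.zarray" or "air/0.0"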
found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key}

return list(found_var_names)


@@ -187,6 +230,7 @@ def extract_array_refs(
}

return fully_decode_arr_refs(arr_refs)

else:
raise KeyError(
f"Could not find zarr array variable name {var_name}, only {found_var_names}"
66 changes: 64 additions & 2 deletions virtualizarr/tests/test_xarray.py
@@ -1,6 +1,7 @@
from collections.abc import Mapping
from unittest.mock import patch

import fsspec
import numpy as np
import pytest
import xarray as xr
@@ -349,8 +350,10 @@ class TestReadFromURL:
"hdf4",
"https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
),
# https://github.com/zarr-developers/VirtualiZarr/issues/159
# ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"),
(
"hdf5",
"https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5",
),
pytest.param(
"tiff",
"https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
@@ -375,10 +378,48 @@ def test_read_from_url(self, filetype, url):
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={}, indexes={})
elif filetype == "hdf5":
vds = open_virtual_dataset(
url,
group="science/LSAR/GCOV/grids/frequencyA",
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
indexes={},
reader_options={},
)
assert isinstance(vds, xr.Dataset)
else:
vds = open_virtual_dataset(url, indexes={})
assert isinstance(vds, xr.Dataset)

def test_virtualizarr_vs_local_nisar(self):
# Open group directly from locally cached file with xarray
url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5"
tmpfile = fsspec.open_local(
f"filecache::{url}", filecache=dict(cache_storage="/tmp", same_names=True)
)
Review comment (Member): I'm not familiar with this fsspec function. Is this not something that can just be done with pathlib?
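
# For context on the question above: fsspec.open_local() resolves the
# "filecache::" protocol by downloading the remote file once into
# cache_storage and returning a plain local path. A rough stdlib-only
# equivalent (an illustrative sketch, not part of this PR) would be:
#
#   from pathlib import Path
#   from urllib.request import urlretrieve
#   local = Path("/tmp") / url.split("/")[-1]
#   if not local.exists():
#       urlretrieve(url, str(local))
#   tmpfile = str(local)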

hdf_group = "science/LSAR/GCOV/grids/frequencyA"
dsXR = xr.open_dataset(
tmpfile,
engine="h5netcdf",
group=hdf_group,
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
phony_dims="access",
)

# save group reference file via virtualizarr, then open with engine="kerchunk"
vds = open_virtual_dataset(
tmpfile,
group=hdf_group,
indexes={},
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
)
tmpref = "/tmp/cmip6.json"
Review comment (Member): pytest has a fixture tmpdir - I think we just want to write to that?

vds.virtualize.to_kerchunk(tmpref, format="json")
dsV = xr.open_dataset(tmpref, engine="kerchunk")

# xrt.assert_identical(dsXR, dsV) #Attribute order changes
xrt.assert_equal(dsXR, dsV)


class TestLoadVirtualDataset:
def test_loadable_variables(self, netcdf4_file):
@@ -406,6 +447,26 @@ def test_explicit_filetype(self, netcdf4_file):
with pytest.raises(NotImplementedError):
open_virtual_dataset(netcdf4_file, filetype="grib")

def test_group_kwarg(self, hdf5_groups_file):
with pytest.raises(ValueError, match="Multiple HDF Groups found"):
open_virtual_dataset(hdf5_groups_file)
with pytest.raises(ValueError, match="not found in"):
open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")

vars_to_load = ["air", "time"]
vds = open_virtual_dataset(
hdf5_groups_file,
group="test/group",
loadable_variables=vars_to_load,
indexes={},
)
full_ds = xr.open_dataset(
hdf5_groups_file, group="test/group", decode_times=False
)
for name in full_ds.variables:
if name in vars_to_load:
xrt.assert_identical(vds.variables[name], full_ds.variables[name])

@patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
def test_open_virtual_dataset_passes_expected_args(
self, mock_read_kerchunk, netcdf4_file
@@ -415,6 +476,7 @@ def test_open_virtual_dataset_passes_expected_args(
args = {
"filepath": netcdf4_file,
"filetype": None,
"group": None,
"reader_options": reader_options,
}
mock_read_kerchunk.assert_called_once_with(**args)