Support specifying single HDF Group in open_virtual_dataset #165

Merged · 14 commits · Aug 27, 2024
13 changes: 13 additions & 0 deletions conftest.py
@@ -32,6 +32,19 @@ def netcdf4_file(tmpdir):
return filepath


@pytest.fixture
def hdf5_groups_file(tmpdir):
# Set up example xarray dataset
ds = xr.tutorial.open_dataset("air_temperature")

# Save it to disk as netCDF (in temporary directory)
filepath = f"{tmpdir}/air.nc"
ds.to_netcdf(filepath, format="NETCDF4", group="test/group")
ds.close()

return filepath


@pytest.fixture
def netcdf4_files(tmpdir):
# Set up example xarray dataset
2 changes: 2 additions & 0 deletions docs/releases.rst
@@ -8,6 +8,8 @@ v1.0.1 (unreleased)

New Features
~~~~~~~~~~~~
- New ``group`` option on ``open_virtual_dataset`` enables extracting a specific HDF group.
(:pull:`165`) By `Scott Henderson <https://github.com/scottyhq>`_.
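
A minimal usage sketch of the new option (the ``air.nc`` filename and ``test/group`` path are illustrative, borrowed from the test fixture above):

from virtualizarr import open_virtual_dataset

# only variables stored under the given HDF5/netCDF4 group end up in the virtual dataset
vds = open_virtual_dataset("air.nc", group="test/group", indexes={})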

Breaking changes
~~~~~~~~~~~~~~~~
60 changes: 52 additions & 8 deletions virtualizarr/kerchunk.py
@@ -18,12 +18,13 @@
# (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html)
# TODO I would prefer to be more specific about these types
KerchunkStoreRefs = NewType(
"KerchunkStoreRefs", dict
) # top-level dict with keys for 'version', 'refs'
"KerchunkStoreRefs",
dict, # dict_keys(['version', 'refs'])
) # top-level dict containing the kerchunk version and a 'refs' dict, assumed to hold a single '.zgroup' key plus multiple KerchunkArrRefs
KerchunkArrRefs = NewType(
"KerchunkArrRefs",
dict,
) # lower-level dict containing just the information for one zarr array
dict, # dict_keys(['.zarray', '.zattrs', '0.0', '0.1', ...])
) # lower-level dict defining a single Zarr Array, with keys for '.zarray', '.zattrs', and every chunk
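
For orientation, a hand-written sketch of these two levels (kerchunk v1 layout; the paths, offsets, and lengths are made up):

# KerchunkStoreRefs: the whole reference store
store_refs = {
    "version": 1,
    "refs": {
        ".zgroup": '{"zarr_format": 2}',
        "air/.zarray": '{"chunks": [25, 53], ...}',
        "air/.zattrs": '{"_ARRAY_DIMENSIONS": ["lat", "lon"]}',
        "air/0.0": ["air.nc", 15032, 7738],  # [file path, byte offset, length]
    },
}

# KerchunkArrRefs: one array's slice of "refs", with the variable-name prefix stripped
arr_refs = {
    ".zarray": '{"chunks": [25, 53], ...}',
    ".zattrs": '{"_ARRAY_DIMENSIONS": ["lat", "lon"]}',
    "0.0": ["air.nc", 15032, 7738],
}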


class AutoName(Enum):
@@ -59,6 +60,7 @@ def default(self, obj):
def read_kerchunk_references_from_file(
filepath: str,
filetype: FileType | None,
group: str | None,
reader_options: Optional[dict[str, Any]] = None,
) -> KerchunkStoreRefs:
"""
@@ -71,22 +73,21 @@ def read_kerchunk_references_from_file(
filetype : FileType, default: None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
If not provided, will attempt to automatically infer the correct filetype from the filepath's extension.
group : str, default: None
Path to the HDF5/netCDF4 group in the given file to open. Given as a str, supported by filetypes "netcdf4" and "hdf5".
reader_options: dict, default {'storage_options':{'key':'', 'secret':'', 'anon':True}}
Dict passed into Kerchunk file readers. Note: Each Kerchunk file reader has distinct arguments,
so ensure reader_options match selected Kerchunk reader arguments.
"""

if filetype is None:
filetype = _automatically_determine_filetype(
filepath=filepath, reader_options=reader_options
)
filetype = FileType(filetype)

if reader_options is None:
reader_options = {}

# if filetype is user defined, convert to FileType
filetype = FileType(filetype)

if filetype.name.lower() == "netcdf3":
from kerchunk.netCDF3 import NetCDF3ToZarr

@@ -98,6 +99,9 @@ def read_kerchunk_references_from_file(
refs = SingleHdf5ToZarr(
filepath, inline_threshold=0, **reader_options
).translate()

refs = extract_group(refs, group)

elif filetype.name.lower() == "grib":
# TODO Grib files should be handled as a DataTree object
# see https://github.com/TomNicholas/VirtualiZarr/issues/11
@@ -125,6 +129,44 @@ def read_kerchunk_references_from_file(
return refs


def extract_group(vds_refs: KerchunkStoreRefs, group: str | None) -> KerchunkStoreRefs:
"""Extract only the part of the kerchunk reference dict that is relevant to a single HDF group"""
hdf_groups = [
k.removesuffix(".zgroup") for k in vds_refs["refs"].keys() if ".zgroup" in k
]
if len(hdf_groups) == 1:
return vds_refs
else:
if group is None:
raise ValueError(
f"Multiple HDF Groups found. Must specify group= keyword to select one of {hdf_groups}"
)
else:
# Ensure supplied group kwarg is consistent with kerchunk keys
if not group.endswith("/"):
group += "/"
if group.startswith("/"):
group = group.removeprefix("/")

if group not in hdf_groups:
raise ValueError(f'Group "{group}" not found in {hdf_groups}')

# Filter by group prefix and remove prefix from all keys
groupdict = {
k.removeprefix(group): v
for k, v in vds_refs["refs"].items()
if k.startswith(group)
}
# Also remove group prefix from _ARRAY_DIMENSIONS
for k, v in groupdict.items():
if isinstance(v, str):
groupdict[k] = v.replace("\\/", "/").replace(group, "")

vds_refs["refs"] = groupdict

return KerchunkStoreRefs(vds_refs)
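
A small worked example of the filtering above (group names and byte ranges invented for illustration):

refs = {
    "version": 1,
    "refs": {
        ".zgroup": '{"zarr_format": 2}',
        "test/.zgroup": '{"zarr_format": 2}',
        "test/group/.zgroup": '{"zarr_format": 2}',
        "test/group/air/.zarray": '{"chunks": [25, 53], ...}',
        "test/group/air/0.0": ["air.nc", 8192, 4096],
    },
}
# Three '.zgroup' keys are found, so group= must be given. extract_group(refs, "test/group")
# normalizes the group to "test/group/", keeps only keys under that prefix, and strips it,
# leaving refs rooted at the group: {".zgroup": ..., "air/.zarray": ..., "air/0.0": [...]}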


def _automatically_determine_filetype(
*,
filepath: str,
@@ -166,6 +208,7 @@ def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]:

refs = ds_reference_dict["refs"]
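# variable names are the first path component of keys like "air/.zarray" or "air/0.0"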
found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key}

return list(found_var_names)


@@ -187,6 +230,7 @@ def extract_array_refs(
}

return fully_decode_arr_refs(arr_refs)

else:
raise KeyError(
f"Could not find zarr array variable name {var_name}, only {found_var_names}"
66 changes: 64 additions & 2 deletions virtualizarr/tests/test_xarray.py
@@ -1,6 +1,7 @@
from collections.abc import Mapping
from unittest.mock import patch

import fsspec
import numpy as np
import pytest
import xarray as xr
@@ -349,8 +350,10 @@ class TestReadFromURL:
"hdf4",
"https://github.com/corteva/rioxarray/raw/master/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf",
),
# https://github.com/zarr-developers/VirtualiZarr/issues/159
# ("hdf5", "https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/NEONDSTowerTemperatureData.hdf5"),
(
"hdf5",
"https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5",
),
pytest.param(
"tiff",
"https://github.com/fsspec/kerchunk/raw/main/kerchunk/tests/lcmap_tiny_cog_2020.tif",
@@ -375,10 +378,48 @@ def test_read_from_url(self, filetype, url):
if filetype in ["grib", "jpg", "hdf4"]:
with pytest.raises(NotImplementedError):
vds = open_virtual_dataset(url, reader_options={}, indexes={})
elif filetype == "hdf5":
vds = open_virtual_dataset(
url,
group="science/LSAR/GCOV/grids/frequencyA",
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
indexes={},
reader_options={},
)
assert isinstance(vds, xr.Dataset)
else:
vds = open_virtual_dataset(url, indexes={})
assert isinstance(vds, xr.Dataset)

def test_virtualizarr_vs_local_nisar(self):
# Open group directly from locally cached file with xarray
url = "https://nisar.asf.earthdatacloud.nasa.gov/NISAR-SAMPLE-DATA/GCOV/ALOS1_Rosamond_20081012/NISAR_L2_PR_GCOV_001_005_A_219_4020_SHNA_A_20081012T060910_20081012T060926_P01101_F_N_J_001.h5"
tmpfile = fsspec.open_local(
f"filecache::{url}", filecache=dict(cache_storage="/tmp", same_names=True)
)
Review comment (Member): I'm not familiar with this fsspec function. Is this not something that can just be done with pathlib?
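
# For context on the question above: fsspec.open_local() resolves the
# "filecache::" protocol by downloading the remote file once into
# cache_storage and returning a plain local path. A rough stdlib-only
# equivalent (an illustrative sketch, not part of this PR) would be:
#
#   from pathlib import Path
#   from urllib.request import urlretrieve
#   local = Path("/tmp") / url.split("/")[-1]
#   if not local.exists():
#       urlretrieve(url, str(local))
#   tmpfile = str(local)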

hdf_group = "science/LSAR/GCOV/grids/frequencyA"
dsXR = xr.open_dataset(
tmpfile,
engine="h5netcdf",
group=hdf_group,
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
phony_dims="access",
)

# save group reference file via virtualizarr, then open with engine="kerchunk"
vds = open_virtual_dataset(
tmpfile,
group=hdf_group,
indexes={},
drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
)
tmpref = "/tmp/cmip6.json"
Review comment (Member): pytest has a fixture tmpdir - I think we just want to write to that?

vds.virtualize.to_kerchunk(tmpref, format="json")
dsV = xr.open_dataset(tmpref, engine="kerchunk")

# xrt.assert_identical(dsXR, dsV) #Attribute order changes
xrt.assert_equal(dsXR, dsV)


class TestLoadVirtualDataset:
def test_loadable_variables(self, netcdf4_file):
@@ -406,6 +447,26 @@ def test_explicit_filetype(self, netcdf4_file):
with pytest.raises(NotImplementedError):
open_virtual_dataset(netcdf4_file, filetype="grib")

def test_group_kwarg(self, hdf5_groups_file):
with pytest.raises(ValueError, match="Multiple HDF Groups found"):
open_virtual_dataset(hdf5_groups_file)
with pytest.raises(ValueError, match="not found in"):
open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")

vars_to_load = ["air", "time"]
vds = open_virtual_dataset(
hdf5_groups_file,
group="test/group",
loadable_variables=vars_to_load,
indexes={},
)
full_ds = xr.open_dataset(
hdf5_groups_file, group="test/group", decode_times=False
)
for name in full_ds.variables:
if name in vars_to_load:
xrt.assert_identical(vds.variables[name], full_ds.variables[name])

@patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
def test_open_virtual_dataset_passes_expected_args(
self, mock_read_kerchunk, netcdf4_file
@@ -415,6 +476,7 @@ def test_open_virtual_dataset_passes_expected_args(
args = {
"filepath": netcdf4_file,
"filetype": None,
"group": None,
"reader_options": reader_options,
}
mock_read_kerchunk.assert_called_once_with(**args)