diff --git a/CHANGELOG.md b/CHANGELOG.md index 1727dad..68f0d9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +0.5.0 (unreleased) +================== + +**Breaking changes** +- Nested group handling: + Before this version, all groups were read, but variables with the same name in different groups would shadow one another. Now, like xarray's ``open_dataset``, ``open_ncml`` accepts an optional ``group`` argument to specify which group should be read. When ``group`` is not specified, it defaults to the root group. Additionally, ``group`` can be set to ``'*'`` so that every group is read and the hierarchy is flattened. In the event of conflicting variable/dimension names across groups, the conflicting name is made unique by appending ``'__n'``, where ``n`` is a counter starting at 1 (e.g. ``index__1``). + + 0.4.0 (2024-01-08) ================== diff --git a/tests/data/testGroup.xml b/tests/data/testGroup.xml new file mode 100644 index 0000000..00260ac --- /dev/null +++ b/tests/data/testGroup.xml @@ -0,0 +1,16 @@ + + + + 3 + + + + 1 + + + + + 2 + + + diff --git a/tests/data/testGroupConflictingDims.xml b/tests/data/testGroupConflictingDims.xml new file mode 100644 index 0000000..c4cf6eb --- /dev/null +++ b/tests/data/testGroupConflictingDims.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/tests/data/testGroupInvalidDim.xml b/tests/data/testGroupInvalidDim.xml new file mode 100644 index 0000000..5ddce24 --- /dev/null +++ b/tests/data/testGroupInvalidDim.xml @@ -0,0 +1,7 @@ + + + + 3 + + + diff --git a/tests/data/testGroupMultiLayers.xml b/tests/data/testGroupMultiLayers.xml new file mode 100644 index 0000000..4d41694 --- /dev/null +++ b/tests/data/testGroupMultiLayers.xml @@ -0,0 +1,18 @@ + + + + 2 + + + + + + + + + + + + + + diff --git a/tests/test_parser.py b/tests/test_parser.py index ddab016..9676daa 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -310,22 +310,22 @@ def test_unsigned_type(): def test_empty_scalar__no_values_tag(): """ - Scalar without values loose 
their type because we can't create a typed numpy - scalar which is empty + A scalar variable which is missing will have its value set to + the default value of its type. """ ds = xncml.open_ncml(data / 'testEmptyScalar.xml') - assert ds['empty_scalar_var'].dtype == np.dtype('O') - assert ds['empty_scalar_var'].item() is None + assert ds['empty_scalar_var'].dtype == np.dtype('float64') + assert ds['empty_scalar_var'].item() == 0 def test_empty_scalar__with_empty_values_tag(): - """A scalar variable with an empty tag is invalid.""" + """A scalar with an empty tag is invalid.""" with pytest.raises(ValueError, match='No values found for variable .*'): xncml.open_ncml(data / 'testEmptyScalar_withValuesTag.xml') def test_multiple_values_for_scalar(): - """Scalar with an multiple values in tag is invalid.""" + """A scalar with multiple values in its tag is invalid.""" with pytest.raises(ValueError, match='The expected size for variable .* was 1, .*'): xncml.open_ncml(data / 'testEmptyScalar_withMultipleValues.xml') @@ -343,6 +343,66 @@ def test_empty_attr(): assert ds.attrs['comment'] == '' +def test_read_group__read_only_root_group(): + """By default, only read root group.""" + ds = xncml.open_ncml(data / 'testGroup.xml') + assert ds.toto is not None + assert ds.get('group_var') is None + assert ds.get('other_group_var') is None + + +def test_read_group__read_sub_group(): + """Read specified sub group and its parents.""" + ds = xncml.open_ncml(data / 'testGroup.xml', group='a_sub_group') + assert ds.toto is not None + assert ds.get('group_var') is not None + ds.group_var.attrs['group_path'] = '/a_sub_group' + assert ds.get('other_group_var') is None + + +def test_read_group__conflicting_dims(): + """Read a group and ensure its dimension is correct""" + ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='gr_b') + assert ds.dims['index'] == 94 + assert 'index' in ds.gr_b_var.dims + + +def test_read__invalid_dim(): + with pytest.raises(ValueError, 
match="Unknown dimension 'myDim'.*"): + xncml.open_ncml(data / 'testGroupInvalidDim.xml') + + +def test_flatten_groups(): + """Read every group and flatten everything in a single dataset/group.""" + ds = xncml.open_ncml(data / 'testGroup.xml', group='*') + assert ds.toto is not None + assert ds.get('toto__1') is None + assert ds.get('group_var') is not None + ds.group_var.attrs['group_path'] = '/a_sub_group' + assert ds.get('other_group_var') is not None + ds.other_group_var.attrs['group_path'] = '/another_sub_group' + + +def test_flatten_groups__conflicting_dims(): + """Read every group and rename dimensions""" + ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='*') + assert 'index' in ds.gr_a_var.dims + assert ds.dims['index'] is not None + assert 'index__1' in ds.gr_b_var.dims + assert ds.dims['index__1'] is not None + + +def test_flatten_groups__sub_groups(): + """Read every group and rename dimensions""" + ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*') + assert ds.dims['index'] == 42 + assert ds.dims['index__1'] == 22 + assert ds['a_var'].size == 1 + assert ds['a_var'] == 2 + assert ds['a_var__1'].size == 42 + assert ds['a_var__2'].size == 22 + + # --- # def check_dimension(ds): assert len(ds['lat']) == 3 diff --git a/xncml/parser.py b/xncml/parser.py index 7bc8f1a..a71f249 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -22,7 +22,6 @@ - - - -- - Support for these attributes is missing: @@ -36,6 +35,7 @@ import datetime as dt from functools import partial from pathlib import Path +from typing import TYPE_CHECKING from warnings import warn import numpy as np @@ -57,10 +57,16 @@ Variable, ) -__author__ = 'David Huard' +if TYPE_CHECKING: + from collections.abc import Iterator + +__author__ = 'David Huard, Abel Aoun' __date__ = 'July 2022' __contact__ = 'huard.david@ouranos.ca' +FLATTEN_GROUPS = '*' +ROOT_GROUP = '/' + def parse(path: Path) -> Netcdf: """ @@ -80,7 +86,7 @@ def parse(path: Path) -> Netcdf: return 
parser.from_path(path, Netcdf) -def open_ncml(ncml: str | Path) -> xr.Dataset: +def open_ncml(ncml: str | Path, group: str = ROOT_GROUP) -> xr.Dataset: """ Convert NcML document to a dataset. @@ -88,6 +94,10 @@ def open_ncml(ncml: str | Path) -> xr.Dataset: ---------- ncml : str | Path Path to NcML file. + group : str + Path of the group to parse within the ncml. + The special value ``*`` opens every group and flattens the variables into a single + dataset, renaming variables and dimensions if conflicting names are found. Returns ------- @@ -98,10 +108,12 @@ def open_ncml(ncml: str | Path) -> xr.Dataset: ncml = Path(ncml) obj = parse(ncml) - return read_netcdf(xr.Dataset(), xr.Dataset(), obj, ncml) + return read_netcdf(xr.Dataset(), xr.Dataset(), obj, ncml, group) -def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) -> xr.Dataset: +def read_netcdf( + target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path, group: str +) -> xr.Dataset: """ Return content of element. @@ -115,6 +127,10 @@ def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) -> object description. ncml : Path Path to NcML document, sometimes required to follow relative links. + group : str + Path of the group to parse within the ncml. + The special value ``*`` opens every group and flattens the variables into a single + dataset. 
Returns ------- @@ -133,10 +149,12 @@ def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) -> for item in filter_by_class(obj.choice, Aggregation): target = read_aggregation(target, item, ncml) - - # Handle , and elements - target = read_group(target, ref, obj) - + if group == FLATTEN_GROUPS: + target = _flatten_groups(target, ref, obj) + else: + if not group.startswith('/'): + group = f'/{group}' + target = read_group(target, ref, obj, groups_to_read=[group]) return target @@ -172,7 +190,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat for item in obj.netcdf: # Open dataset defined in 's `location` attribute - tar = read_netcdf(xr.Dataset(), ref=xr.Dataset(), obj=item, ncml=ncml) + tar = read_netcdf(xr.Dataset(), ref=xr.Dataset(), obj=item, ncml=ncml, group=ROOT_GROUP) closers.append(getattr(tar, '_close')) # Select variables @@ -209,7 +227,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat else: raise NotImplementedError - agg = read_group(agg, None, obj) + agg = read_group(agg, ref=None, obj=obj, groups_to_read=[ROOT_GROUP]) out = target.merge(agg, combine_attrs='no_conflicts') out.set_close(partial(_multi_file_closer, closers)) return out @@ -243,8 +261,31 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset: return xr.open_dataset(location, decode_times=False) +def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[str]: + group_children = [child for child in group.choice if isinstance(child, Group)] + current_path = ROOT_GROUP if parent is None else f'{parent}{group.name}/' + if len(group_children) == 0: + yield current_path + for child in group_children: + yield from _get_leaves(child, parent=current_path) + + +def _flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset: + dims = {} + enums = {} + leaves_group = list(_get_leaves(root_group)) + read_group(target, ref, root_group, groups_to_read=leaves_group, 
dims=dims, enums=enums) + return target + + def read_group( - target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf, dims: dict = None + target: xr.Dataset, + ref: xr.Dataset | None, + obj: Group | Netcdf, + groups_to_read: list[str], + parent_group_path: str = ROOT_GROUP, + dims: dict = None, + enums: dict = None, ) -> xr.Dataset: """ Parse items, typically , , and elements. @@ -253,10 +294,16 @@ def read_group( ---------- target : xr.Dataset Target dataset to be updated. - ref : xr.Dataset + ref : xr.Dataset | None Reference dataset used to copy content into `target`. obj : Group | Netcdf - object description. + object description. + groups_to_read : list[str] + List of groups that must be read and included in `target`. + parent_group_path : str + Path of parent group, by default the root group '/'. + dims: dict[str, Dimension] + Dictionary of the dimensions of this dataset. Returns ------- @@ -264,12 +311,16 @@ def read_group( Dataset holding variables and attributes defined in element. 
""" dims = {} if dims is None else dims - enums = {} + enums = {} if enums is None else enums for item in obj.choice: if isinstance(item, Dimension): - dims[item.name] = read_dimension(item) + dim_name = item.name + if dims.get(dim_name): + dims[dim_name].append(read_dimension(item)) + else: + dims[dim_name] = [read_dimension(item)] elif isinstance(item, Variable): - target = read_variable(target, ref, item, dims, enums) + target = read_variable(target, ref, item, dims, enums, group_path=parent_group_path) elif isinstance(item, Attribute): read_attribute(target, item, ref) elif isinstance(item, Remove): @@ -277,12 +328,22 @@ def read_group( elif isinstance(item, EnumTypedef): enums[item.name] = read_enum(item) elif isinstance(item, Group): - target = read_group(target, ref, item, dims) + if any(item.name in group_name for group_name in groups_to_read): + target = read_group( + target, + ref, + item, + parent_group_path=f'{parent_group_path}{item.name}/', + dims=dims, + groups_to_read=groups_to_read, + ) + else: + # ignore group + continue elif isinstance(item, Aggregation): pass # elements are parsed in `read_netcdf` else: raise AttributeError - return target @@ -407,8 +468,13 @@ def read_enum(obj: EnumTypedef) -> dict[str, list]: def read_variable( - target: xr.Dataset, ref: xr.Dataset, obj: Variable, dimensions: dict, enums: dict -): + target: xr.Dataset, + ref: xr.Dataset, + obj: Variable, + dimensions: dict, + enums: dict, + group_path: str, +) -> xr.Dataset: """ Parse element. @@ -423,6 +489,9 @@ def read_variable( dimensions : dict Dimension attributes keyed by name. enums: dict[str, dict] + The enums types that have been read in the parent groups. + group_path: str + Path to the parent group. 
Returns ------- @@ -441,30 +510,47 @@ def read_variable( else: ref_var = None + var_name = obj.name # Read existing data or create empty DataArray - if (obj.name in target) or (obj.name in target.dims): - out = xr.as_variable(target[obj.name]) + if (existing_var := target.get(var_name)) is not None and existing_var.attrs.get( + 'group_path' + ) in [None, group_path]: + out = xr.as_variable(target[var_name]) if obj.type: out = out.astype(nctype(obj.type)) ref_var = None - elif (obj.name in ref) or (obj.name in ref.dims): - out = xr.as_variable(ref[obj.name]) + elif (existing_var := ref.get(var_name)) is not None and existing_var.attrs.get( + 'group_path' + ) in [None, group_path]: + out = xr.as_variable(ref[var_name]) if obj.type: out = out.astype(nctype(obj.type)) - ref_var = ref[obj.name] + ref_var = ref[var_name] elif obj.shape: - dims = obj.shape.split(' ') - shape = [dimensions[dim].length for dim in dims] - out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=dims) + var_dims = [] + shape = [] + for dim in obj.shape.split(' '): + if dimensions.get(dim) is None: + err = ( + f"Unknown dimension '{dim}'." + ' Make sure it is declared before being used in the NCML.' + ) + raise ValueError(err) + shape.append(dimensions[dim][-1].length) + if (dim_count := len(dimensions[dim])) > 1: + dim = f'{dim}__{dim_count - 1}' + var_dims.append(dim) + out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=var_dims) elif obj.shape == '': - out = build_scalar_variable(var_name=obj.name, values_tag=obj.values, var_type=obj.type) + out = build_scalar_variable(var_name=var_name, values_tag=obj.values, var_type=obj.type) else: - error_msg = f'Could not build variable `{obj.name}`.' + error_msg = f'Could not build variable `{var_name }`.' 
raise ValueError(error_msg) # Set variable attributes for item in obj.attribute: read_attribute(out, item, ref=ref_var) + out.attrs['group_path'] = group_path # Remove attributes or dimensions for item in obj.remove: @@ -472,7 +558,7 @@ def read_variable( # Read values for arrays (already done for a scalar) if obj.values and obj.shape != '': - data = read_values(obj.name, out.size, obj.values) + data = read_values(var_name, out.size, obj.values) data = out.dtype.type(data) out = xr.Variable( out.dims, @@ -490,14 +576,23 @@ def read_variable( raise NotImplementedError if obj.typedef in enums.keys(): - # TODO: Also update encoding when https://github.com/pydata/xarray/pull/8147 - # is merged in xarray. + # TODO (@bzah): Update this once Enums are merged in xarray + # https://github.com/pydata/xarray/pull/8147 out.attrs['flag_values'] = enums[obj.typedef]['flag_values'] out.attrs['flag_meanings'] = enums[obj.typedef]['flag_meanings'] elif obj.typedef is not None: raise NotImplementedError + import re - target[obj.name] = out + reg = re.compile(f'^{var_name}__|{var_name}') + similar_vars_but_diff_path = [ + v + for v in target.data_vars + if reg.match(v) and target[v].attrs.get('group_path') not in [None, group_path] + ] + if len(similar_vars_but_diff_path) > 0: + var_name = f'{var_name}__{len(similar_vars_but_diff_path)}' + target[var_name] = out return target @@ -521,7 +616,8 @@ def read_values(var_name: str, expected_size: int, values_tag: Values) -> list: if values_tag.from_attribute is not None: error_msg = ( 'xncml cannot yet fetch values from a global or a ' - f' variable attribute using , here on variable {var_name}.' + ' variable attribute using , here on variable' + f' {var_name}.' 
) raise NotImplementedError(error_msg) if values_tag.start is not None and values_tag.increment is not None: @@ -551,7 +647,7 @@ def read_values(var_name: str, expected_size: int, values_tag: Values) -> list: def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> xr.Variable: - """Read values for element. + """Build an xr.Variable for scalar variables. Parameters ---------- @@ -573,12 +669,13 @@ def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> x If the tag is not a valid scalar. """ if values_tag is None: + default_value = nctype(var_type)() warn( - f'Could not set the type for the scalar variable {var_name}, as its' - ' is empty. Provide a single values within ' + f'The scalar variable {var_name} has no values set within' + f' . A default value of {default_value} is set' ' to preserve the type.' ) - return xr.Variable(data=None, dims=()) + return xr.Variable(data=default_value, dims=()) values_content = read_values(var_name, expected_size=1, values_tag=values_tag) if len(values_content) == 1: return xr.Variable(data=np.array(values_content[0], dtype=nctype(var_type))[()], dims=())