diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1727dad..68f0d9c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+0.5.0 (unreleased)
+==================
+
+**Breaking changes**
+- Nested group handling:
+ Before this version, all groups were read, but variables sharing the same name in different groups would shadow one another. Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional ``group`` argument to specify which group should be read. When ``group`` is not specified, it defaults to the root group. Additionally, ``group`` can be set to ``'*'`` so that every group is read and the hierarchy is flattened. When variable or dimension names conflict across groups, the later occurrences are renamed by appending ``'__n'``, where ``n`` starts at 1 and is incremented for each additional conflict.
+
+
0.4.0 (2024-01-08)
==================
diff --git a/tests/data/testGroup.xml b/tests/data/testGroup.xml
new file mode 100644
index 0000000..00260ac
--- /dev/null
+++ b/tests/data/testGroup.xml
@@ -0,0 +1,16 @@
+
+
+
+ 3
+
+
+
+ 1
+
+
+
+
+ 2
+
+
+
diff --git a/tests/data/testGroupConflictingDims.xml b/tests/data/testGroupConflictingDims.xml
new file mode 100644
index 0000000..c4cf6eb
--- /dev/null
+++ b/tests/data/testGroupConflictingDims.xml
@@ -0,0 +1,11 @@
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/data/testGroupInvalidDim.xml b/tests/data/testGroupInvalidDim.xml
new file mode 100644
index 0000000..5ddce24
--- /dev/null
+++ b/tests/data/testGroupInvalidDim.xml
@@ -0,0 +1,7 @@
+
+
+
+ 3
+
+
+
diff --git a/tests/data/testGroupMultiLayers.xml b/tests/data/testGroupMultiLayers.xml
new file mode 100644
index 0000000..4d41694
--- /dev/null
+++ b/tests/data/testGroupMultiLayers.xml
@@ -0,0 +1,18 @@
+
+
+
+ 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_parser.py b/tests/test_parser.py
index ddab016..9676daa 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -310,22 +310,22 @@ def test_unsigned_type():
def test_empty_scalar__no_values_tag():
"""
- Scalar without values loose their type because we can't create a typed numpy
- scalar which is empty
+ A scalar variable declared without any ``values`` element has its value
+ set to the default value of its type.
"""
ds = xncml.open_ncml(data / 'testEmptyScalar.xml')
- assert ds['empty_scalar_var'].dtype == np.dtype('O')
- assert ds['empty_scalar_var'].item() is None
+ assert ds['empty_scalar_var'].dtype == np.dtype('float64')
+ assert ds['empty_scalar_var'].item() == 0
def test_empty_scalar__with_empty_values_tag():
- """A scalar variable with an empty tag is invalid."""
+ """A scalar with an empty tag is invalid."""
with pytest.raises(ValueError, match='No values found for variable .*'):
xncml.open_ncml(data / 'testEmptyScalar_withValuesTag.xml')
def test_multiple_values_for_scalar():
- """Scalar with an multiple values in tag is invalid."""
+ """A scalar with multiple values in its tag is invalid."""
with pytest.raises(ValueError, match='The expected size for variable .* was 1, .*'):
xncml.open_ncml(data / 'testEmptyScalar_withMultipleValues.xml')
@@ -343,6 +343,66 @@ def test_empty_attr():
assert ds.attrs['comment'] == ''
+def test_read_group__read_only_root_group():
+    """Without a ``group`` argument, only the content of the root group is loaded."""
+    root_ds = xncml.open_ncml(data / 'testGroup.xml')
+    assert root_ds.toto is not None
+    for sub_group_var in ('group_var', 'other_group_var'):
+        assert root_ds.get(sub_group_var) is None
+
+
+def test_read_group__read_sub_group():
+    """Read specified sub group and its parents."""
+    ds = xncml.open_ncml(data / 'testGroup.xml', group='a_sub_group')
+    assert ds.toto is not None
+    # Each variable is tagged with the path of its group, which ends with a '/'.
+    assert ds.group_var.attrs['group_path'] == '/a_sub_group/'
+    assert ds.get('other_group_var') is None
+
+
+def test_read_group__conflicting_dims():
+    """Reading only ``gr_b`` keeps its ``index`` dimension un-renamed, with the expected length."""
+    ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='gr_b')
+    assert ds.sizes['index'] == 94
+    assert 'index' in ds.gr_b_var.dims
+
+
+def test_read__invalid_dim():  # a variable shaped on an undeclared dimension must raise
+ with pytest.raises(ValueError, match="Unknown dimension 'myDim'.*"):
+ xncml.open_ncml(data / 'testGroupInvalidDim.xml')
+
+
+def test_flatten_groups():
+    """Read every group and flatten everything in a single dataset/group."""
+    ds = xncml.open_ncml(data / 'testGroup.xml', group='*')
+    assert ds.toto is not None
+    assert ds.get('toto__1') is None
+    assert ds.get('group_var') is not None
+    assert ds.group_var.attrs['group_path'] == '/a_sub_group/'
+    assert ds.get('other_group_var') is not None
+    assert ds.other_group_var.attrs['group_path'] == '/another_sub_group/'
+
+
+def test_flatten_groups__conflicting_dims():
+    """Flattening keeps the first ``index`` dimension and renames the conflicting one."""
+    ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='*')
+    assert 'index' in ds.gr_a_var.dims
+    assert 'index' in ds.sizes
+    assert 'index__1' in ds.gr_b_var.dims
+    assert 'index__1' in ds.sizes
+
+
+def test_flatten_groups__sub_groups():
+    """Flatten nested sub-groups, renaming the conflicting dimensions and variables."""
+    ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*')
+    assert ds.sizes['index'] == 42
+    assert ds.sizes['index__1'] == 22
+    assert ds['a_var'].size == 1
+    assert ds['a_var'].item() == 2
+    assert ds['a_var__1'].size == 42
+    assert ds['a_var__2'].size == 22
+
+
# --- #
def check_dimension(ds):
assert len(ds['lat']) == 3
diff --git a/xncml/parser.py b/xncml/parser.py
index 7bc8f1a..a71f249 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -22,7 +22,6 @@
-
-
-
--
-
Support for these attributes is missing:
@@ -36,6 +35,7 @@
import datetime as dt
from functools import partial
from pathlib import Path
+from typing import TYPE_CHECKING
from warnings import warn
import numpy as np
@@ -57,10 +57,16 @@
Variable,
)
-__author__ = 'David Huard'
+if TYPE_CHECKING:
+ from collections.abc import Iterator
+
+__author__ = 'David Huard, Abel Aoun'
__date__ = 'July 2022'
__contact__ = 'huard.david@ouranos.ca'
+FLATTEN_GROUPS = '*'
+ROOT_GROUP = '/'
+
def parse(path: Path) -> Netcdf:
"""
@@ -80,7 +86,7 @@ def parse(path: Path) -> Netcdf:
return parser.from_path(path, Netcdf)
-def open_ncml(ncml: str | Path) -> xr.Dataset:
+def open_ncml(ncml: str | Path, group: str = ROOT_GROUP) -> xr.Dataset:
"""
Convert NcML document to a dataset.
@@ -88,6 +94,10 @@ def open_ncml(ncml: str | Path) -> xr.Dataset:
----------
ncml : str | Path
Path to NcML file.
+ group : str
+ Path of the group to parse within the ncml.
+ The special value ``*`` opens every group and flattens the variables into a single
+ dataset, renaming variables and dimensions if conflicting names are found.
Returns
-------
@@ -98,10 +108,12 @@ def open_ncml(ncml: str | Path) -> xr.Dataset:
ncml = Path(ncml)
obj = parse(ncml)
- return read_netcdf(xr.Dataset(), xr.Dataset(), obj, ncml)
+ return read_netcdf(xr.Dataset(), xr.Dataset(), obj, ncml, group)
-def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) -> xr.Dataset:
+def read_netcdf(
+ target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path, group: str
+) -> xr.Dataset:
"""
Return content of element.
@@ -115,6 +127,10 @@ def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) ->
object description.
ncml : Path
Path to NcML document, sometimes required to follow relative links.
+ group : str
+ Path of the group to parse within the ncml.
+ The special value ``*`` opens every group and flattens the variables into a single
+ dataset.
Returns
-------
@@ -133,10 +149,12 @@ def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) ->
for item in filter_by_class(obj.choice, Aggregation):
target = read_aggregation(target, item, ncml)
-
- # Handle , and elements
- target = read_group(target, ref, obj)
-
+ if group == FLATTEN_GROUPS:
+ target = _flatten_groups(target, ref, obj)
+ else:
+ if not group.startswith('/'):
+ group = f'/{group}'
+ target = read_group(target, ref, obj, groups_to_read=[group])
return target
@@ -172,7 +190,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat
for item in obj.netcdf:
# Open dataset defined in 's `location` attribute
- tar = read_netcdf(xr.Dataset(), ref=xr.Dataset(), obj=item, ncml=ncml)
+ tar = read_netcdf(xr.Dataset(), ref=xr.Dataset(), obj=item, ncml=ncml, group=ROOT_GROUP)
closers.append(getattr(tar, '_close'))
# Select variables
@@ -209,7 +227,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat
else:
raise NotImplementedError
- agg = read_group(agg, None, obj)
+ agg = read_group(agg, ref=None, obj=obj, groups_to_read=[ROOT_GROUP])
out = target.merge(agg, combine_attrs='no_conflicts')
out.set_close(partial(_multi_file_closer, closers))
return out
@@ -243,8 +261,31 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset:
return xr.open_dataset(location, decode_times=False)
+def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[str]:
+    """Yield the '/'-terminated path of every leaf group (no sub-group) reachable from `group`."""
+    path = ROOT_GROUP if parent is None else f'{parent}{group.name}/'
+    sub_groups = [item for item in group.choice if isinstance(item, Group)]
+    if not sub_groups:
+        yield path
+    yield from (leaf for sub in sub_groups for leaf in _get_leaves(sub, parent=path))
+
+
+def _flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset:
+    """Read `root_group` with every leaf group selected and return the resulting dataset."""
+    # Fresh registries for this traversal; `read_group` fills them in place.
+    dims, enums = {}, {}
+    leaves = list(_get_leaves(root_group))
+    return read_group(target, ref, root_group, groups_to_read=leaves, dims=dims, enums=enums)
+
+
def read_group(
- target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf, dims: dict = None
+ target: xr.Dataset,
+ ref: xr.Dataset | None,
+ obj: Group | Netcdf,
+ groups_to_read: list[str],
+ parent_group_path: str = ROOT_GROUP,
+ dims: dict = None,
+ enums: dict = None,
) -> xr.Dataset:
"""
Parse items, typically , , and elements.
@@ -253,10 +294,16 @@ def read_group(
----------
target : xr.Dataset
Target dataset to be updated.
- ref : xr.Dataset
+ ref : xr.Dataset | None
Reference dataset used to copy content into `target`.
obj : Group | Netcdf
- object description.
+ object description.
+ groups_to_read : list[str]
+ List of groups that must be read and included in `target`.
+ parent_group_path : str
+ Path of parent group, by default the root group '/'.
+ dims: dict[str, list[Dimension]]
+ Dimensions read so far, keyed by name; each name maps to the list of its declarations.
Returns
-------
@@ -264,12 +311,16 @@ def read_group(
Dataset holding variables and attributes defined in element.
"""
dims = {} if dims is None else dims
- enums = {}
+ enums = {} if enums is None else enums
for item in obj.choice:
if isinstance(item, Dimension):
- dims[item.name] = read_dimension(item)
+ dim_name = item.name
+ if dims.get(dim_name):
+ dims[dim_name].append(read_dimension(item))
+ else:
+ dims[dim_name] = [read_dimension(item)]
elif isinstance(item, Variable):
- target = read_variable(target, ref, item, dims, enums)
+ target = read_variable(target, ref, item, dims, enums, group_path=parent_group_path)
elif isinstance(item, Attribute):
read_attribute(target, item, ref)
elif isinstance(item, Remove):
@@ -277,12 +328,22 @@ def read_group(
elif isinstance(item, EnumTypedef):
enums[item.name] = read_enum(item)
elif isinstance(item, Group):
- target = read_group(target, ref, item, dims)
+ if any(item.name in group_name for group_name in groups_to_read):
+ target = read_group(
+ target,
+ ref,
+ item,
+ parent_group_path=f'{parent_group_path}{item.name}/',
+ dims=dims,
+ groups_to_read=groups_to_read,
+ )
+ else:
+ # ignore group
+ continue
elif isinstance(item, Aggregation):
pass # elements are parsed in `read_netcdf`
else:
raise AttributeError
-
return target
@@ -407,8 +468,13 @@ def read_enum(obj: EnumTypedef) -> dict[str, list]:
def read_variable(
- target: xr.Dataset, ref: xr.Dataset, obj: Variable, dimensions: dict, enums: dict
-):
+ target: xr.Dataset,
+ ref: xr.Dataset,
+ obj: Variable,
+ dimensions: dict,
+ enums: dict,
+ group_path: str,
+) -> xr.Dataset:
"""
Parse element.
@@ -423,6 +489,9 @@ def read_variable(
dimensions : dict
Dimension attributes keyed by name.
enums: dict[str, dict]
+ The enums types that have been read in the parent groups.
+ group_path: str
+ Path to the parent group.
Returns
-------
@@ -441,30 +510,47 @@ def read_variable(
else:
ref_var = None
+ var_name = obj.name
# Read existing data or create empty DataArray
- if (obj.name in target) or (obj.name in target.dims):
- out = xr.as_variable(target[obj.name])
+ if (existing_var := target.get(var_name)) is not None and existing_var.attrs.get(
+ 'group_path'
+ ) in [None, group_path]:
+ out = xr.as_variable(target[var_name])
if obj.type:
out = out.astype(nctype(obj.type))
ref_var = None
- elif (obj.name in ref) or (obj.name in ref.dims):
- out = xr.as_variable(ref[obj.name])
+ elif (existing_var := ref.get(var_name)) is not None and existing_var.attrs.get(
+ 'group_path'
+ ) in [None, group_path]:
+ out = xr.as_variable(ref[var_name])
if obj.type:
out = out.astype(nctype(obj.type))
- ref_var = ref[obj.name]
+ ref_var = ref[var_name]
elif obj.shape:
- dims = obj.shape.split(' ')
- shape = [dimensions[dim].length for dim in dims]
- out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=dims)
+ var_dims = []
+ shape = []
+ for dim in obj.shape.split(' '):
+ if dimensions.get(dim) is None:
+ err = (
+ f"Unknown dimension '{dim}'."
+ ' Make sure it is declared before being used in the NCML.'
+ )
+ raise ValueError(err)
+ shape.append(dimensions[dim][-1].length)
+ if (dim_count := len(dimensions[dim])) > 1:
+ dim = f'{dim}__{dim_count - 1}'
+ var_dims.append(dim)
+ out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=var_dims)
elif obj.shape == '':
- out = build_scalar_variable(var_name=obj.name, values_tag=obj.values, var_type=obj.type)
+ out = build_scalar_variable(var_name=var_name, values_tag=obj.values, var_type=obj.type)
else:
- error_msg = f'Could not build variable `{obj.name}`.'
+ error_msg = f'Could not build variable `{var_name }`.'
raise ValueError(error_msg)
# Set variable attributes
for item in obj.attribute:
read_attribute(out, item, ref=ref_var)
+ out.attrs['group_path'] = group_path
# Remove attributes or dimensions
for item in obj.remove:
@@ -472,7 +558,7 @@ def read_variable(
# Read values for arrays (already done for a scalar)
if obj.values and obj.shape != '':
- data = read_values(obj.name, out.size, obj.values)
+ data = read_values(var_name, out.size, obj.values)
data = out.dtype.type(data)
out = xr.Variable(
out.dims,
@@ -490,14 +576,23 @@ def read_variable(
raise NotImplementedError
if obj.typedef in enums.keys():
- # TODO: Also update encoding when https://github.com/pydata/xarray/pull/8147
- # is merged in xarray.
+ # TODO (@bzah): Update this once Enums are merged in xarray
+ # https://github.com/pydata/xarray/pull/8147
out.attrs['flag_values'] = enums[obj.typedef]['flag_values']
out.attrs['flag_meanings'] = enums[obj.typedef]['flag_meanings']
elif obj.typedef is not None:
raise NotImplementedError
+ import re
- target[obj.name] = out
+ reg = re.compile(f'^{var_name}__|{var_name}')
+ similar_vars_but_diff_path = [
+ v
+ for v in target.data_vars
+ if reg.match(v) and target[v].attrs.get('group_path') not in [None, group_path]
+ ]
+ if len(similar_vars_but_diff_path) > 0:
+ var_name = f'{var_name}__{len(similar_vars_but_diff_path)}'
+ target[var_name] = out
return target
@@ -521,7 +616,8 @@ def read_values(var_name: str, expected_size: int, values_tag: Values) -> list:
if values_tag.from_attribute is not None:
error_msg = (
'xncml cannot yet fetch values from a global or a '
- f' variable attribute using , here on variable {var_name}.'
+ ' variable attribute using , here on variable'
+ f' {var_name}.'
)
raise NotImplementedError(error_msg)
if values_tag.start is not None and values_tag.increment is not None:
@@ -551,7 +647,7 @@ def read_values(var_name: str, expected_size: int, values_tag: Values) -> list:
def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> xr.Variable:
- """Read values for element.
+ """Build an xr.Variable for scalar variables.
Parameters
----------
@@ -573,12 +669,13 @@ def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> x
If the tag is not a valid scalar.
"""
if values_tag is None:
+ default_value = nctype(var_type)()
warn(
- f'Could not set the type for the scalar variable {var_name}, as its'
- ' is empty. Provide a single values within '
+ f'The scalar variable {var_name} has no values set within'
+ f' . A default value of {default_value} is set'
' to preserve the type.'
)
- return xr.Variable(data=None, dims=())
+ return xr.Variable(data=default_value, dims=())
values_content = read_values(var_name, expected_size=1, values_tag=values_tag)
if len(values_content) == 1:
return xr.Variable(data=np.array(values_content[0], dtype=nctype(var_type))[()], dims=())