From d5df9153351f3f9b09499532666023a14073eb49 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Thu, 11 Jan 2024 20:01:12 +0100 Subject: [PATCH 01/12] FIX: pre-commit hook for black --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d2cbbc2..0a36b3e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: - id: check-yaml - id: double-quote-string-fixer - - repo: https://github.com/psf/black + - repo: https://github.com/psf/black-pre-commit-mirror rev: 23.12.1 hooks: - id: black From be6acbc66e1454e301312fef9ed5551251555caa Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Fri, 12 Jan 2024 14:35:13 +0100 Subject: [PATCH 02/12] ENH: Add group filtering to open_ncml --- tests/data/testGroup.xml | 11 +++++++ tests/test_parser.py | 35 +++++++++++++++++--- xncml/parser.py | 70 +++++++++++++++++++++++++++++++++------- 3 files changed, 100 insertions(+), 16 deletions(-) create mode 100644 tests/data/testGroup.xml diff --git a/tests/data/testGroup.xml b/tests/data/testGroup.xml new file mode 100644 index 0000000..b798b4d --- /dev/null +++ b/tests/data/testGroup.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/tests/test_parser.py b/tests/test_parser.py index ddab016..4a1f259 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -310,8 +310,8 @@ def test_unsigned_type(): def test_empty_scalar__no_values_tag(): """ - Scalar without values loose their type because we can't create a typed numpy - scalar which is empty + A scalar without a tag will not be typed, even if the 'type' attribute is + filled, because numpy can't create an empty typed scalar. """ ds = xncml.open_ncml(data / 'testEmptyScalar.xml') assert ds['empty_scalar_var'].dtype == np.dtype('O') @@ -319,13 +319,13 @@ def test_empty_scalar__no_values_tag(): def test_empty_scalar__with_empty_values_tag(): - """A scalar variable with an empty tag is invalid.""" + """A scalar with an empty in its tag is invalid.""" with pytest.raises(ValueError, match='No values found for variable .*'): xncml.open_ncml(data / 'testEmptyScalar_withValuesTag.xml') def test_multiple_values_for_scalar(): - """Scalar with an multiple values in tag is invalid.""" + """a scalar with multiple values in its tag is invalid.""" with pytest.raises(ValueError, match='The expected size for variable .* was 1, .*'): xncml.open_ncml(data / 'testEmptyScalar_withMultipleValues.xml') @@ -343,6 +343,33 @@ def test_empty_attr(): assert ds.attrs['comment'] == '' +def test_read_group__read_only_root_group(): + """By default, only read root group.""" + ds = xncml.open_ncml(data / 'testGroup.xml') + assert ds.toto is not None + assert ds.get('group_var') is None + assert ds.get('other_group_var') is None + + +def test_read_group__read_sub_group(): + """Read specified sub group and its parents.""" + ds = xncml.open_ncml(data / 'testGroup.xml', group='a_sub_group') + assert ds.toto is not None + assert ds.get('group_var') is not None + ds.group_var.attrs['group_path'] = '/a_sub_group' + assert ds.get('other_group_var') is None + + +def test_flatten_groups(): + """Read every group and flatten everything in a single dataset/group.""" + ds = xncml.open_ncml(data / 'testGroup.xml', group='*') + assert ds.toto is not None + assert ds.get('group_var') is not None + ds.group_var.attrs['group_path'] = '/a_sub_group' + assert ds.get('other_group_var') is not None + ds.other_group_var.attrs['group_path'] = '/another_sub_group' + + # --- # def check_dimension(ds): assert len(ds['lat']) == 3 diff --git a/xncml/parser.py b/xncml/parser.py index ef80640..fc3ca0e 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -22,7 +22,6 @@ - - - -- - Support for these attributes is missing: @@ -62,6 +61,9 @@ __date__ = 'July 2022' __contact__ = 'huard.david@ouranos.ca' +FLATTEN_GROUPS = '*' +ROOT_GROUP = '/' + def parse(path: Path) -> Netcdf: """ @@ -81,7 +83,7 @@ def parse(path: Path) -> Netcdf: return parser.from_path(path, Netcdf) -def open_ncml(ncml: str | Path) -> xr.Dataset: +def open_ncml(ncml: str | Path, group: str = '/') -> xr.Dataset: """ Convert NcML document to a dataset. @@ -89,6 +91,10 @@ def open_ncml(ncml: str | Path) -> xr.Dataset: ---------- ncml : str | Path Path to NcML file. + group : str + Path of the group to parse within the ncml. + The special value ``*`` opens every group and flatten the variables into a single + dataset. Returns ------- @@ -99,10 +105,12 @@ def open_ncml(ncml: str | Path) -> xr.Dataset: ncml = Path(ncml) obj = parse(ncml) - return read_netcdf(xr.Dataset(), xr.Dataset(), obj, ncml) + return read_netcdf(xr.Dataset(), xr.Dataset(), obj, ncml, group) -def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) -> xr.Dataset: +def read_netcdf( + target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path, group: str +) -> xr.Dataset: """ Return content of element. @@ -116,6 +124,10 @@ def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) -> object description. ncml : Path Path to NcML document, sometimes required to follow relative links. + group : str + Path of the group to parse within the ncml. + The special value ``*`` opens every group and flatten the variables into a single + dataset. Returns ------- @@ -136,8 +148,10 @@ def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) -> target = read_aggregation(target, item, ncml) # Handle , and elements - target = read_group(target, ref, obj) - + if group == FLATTEN_GROUPS: + target = flatten_groups(target, ref, obj) + else: + target = read_group(target, ref, obj, group) return target @@ -173,7 +187,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat for item in obj.netcdf: # Open dataset defined in 's `location` attribute - tar = read_netcdf(xr.Dataset(), ref=xr.Dataset(), obj=item, ncml=ncml) + tar = read_netcdf(xr.Dataset(), ref=xr.Dataset(), obj=item, ncml=ncml, group=ROOT_GROUP) closers.append(getattr(tar, '_close')) # Select variables @@ -210,7 +224,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat else: raise NotImplementedError - agg = read_group(agg, None, obj) + agg = read_group(agg, None, obj, group=ROOT_GROUP) out = target.merge(agg, combine_attrs='no_conflicts') out.set_close(partial(_multi_file_closer, closers)) return out @@ -244,8 +258,13 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset: return xr.open_dataset(location, decode_times=False) +def flatten_groups(target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf): + """TODO doc""" + raise NotImplementedError('TODO') + + def read_group( - target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf, dims: dict = None + target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf, group: str, dims: dict = None ) -> xr.Dataset: """ Parse items, typically , , and elements. @@ -258,6 +277,10 @@ def read_group( Reference dataset used to copy content into `target`. obj : Group | Netcdf object description. + group : str + Path of the group to parse within the ncml. + The special value ``*`` opens every group and flatten the variables into a single + dataset. Returns ------- @@ -266,11 +289,18 @@ def read_group( """ dims = {} if dims is None else dims enums = {} + if not group.startswith('/'): + group = f'/{group}' + prefix, _, child_path = group.partition('/') + if group == ROOT_GROUP or prefix == '': + current_group = ROOT_GROUP + else: + current_group = prefix for item in obj.choice: if isinstance(item, Dimension): dims[item.name] = read_dimension(item) elif isinstance(item, Variable): - target = read_variable(target, ref, item, dims, enums) + target = read_variable(target, ref, item, dims, enums, group_path=group) elif isinstance(item, Attribute): read_attribute(target, item, ref) elif isinstance(item, Remove): @@ -278,7 +308,11 @@ def read_group( elif isinstance(item, EnumTypedef): enums[item.name] = read_enum(item) elif isinstance(item, Group): - target = read_group(target, ref, item, dims) + if item.name in child_path: + target = read_group(target, ref, item, current_group, dims) + else: + # ignore group + continue elif isinstance(item, Aggregation): pass # elements are parsed in `read_netcdf` else: @@ -408,7 +442,12 @@ def read_enum(obj: EnumTypedef) -> dict[str, list]: def read_variable( - target: xr.Dataset, ref: xr.Dataset, obj: Variable, dimensions: dict, enums: dict + target: xr.Dataset, + ref: xr.Dataset, + obj: Variable, + dimensions: dict, + enums: dict, + group_path: str, ): """ Parse element. @@ -424,6 +463,9 @@ def read_variable( dimensions : dict Dimension attributes keyed by name. enums: dict[str, dict] + The enums types that have been read in the upper groups. + group_path: str + Full path to the parent group Returns ------- @@ -466,6 +508,10 @@ def read_variable( # Set variable attributes for item in obj.attribute: read_attribute(out, item, ref=ref_var) + out.attrs['group_path'] = group_path + # TODO (@bzah): Maybe rename the variable using the parent full path + # if we are flattening, this would avoid overriding an existing + # variable of target. # Remove attributes or dimensions for item in obj.remove: From d96544322951cc6ba407abb019e46f5a63e9e633 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Fri, 12 Jan 2024 20:08:13 +0100 Subject: [PATCH 03/12] ENH: Add flatten capabilities to read nested groups When open_ncml is called with group="*", every group will be read and they will be flatten in the resulting dataset. If names are conflicting, the dimensions and varaibles names are appended with __n where n is the number of existing simlar names. --- tests/data/testGroup.xml | 9 +- tests/data/testGroupConflictingDims.xml | 11 ++ tests/data/testGroupMultiLayers.xml | 9 ++ tests/test_parser.py | 23 ++++ xncml/parser.py | 138 +++++++++++++++++------- 5 files changed, 147 insertions(+), 43 deletions(-) create mode 100644 tests/data/testGroupConflictingDims.xml create mode 100644 tests/data/testGroupMultiLayers.xml diff --git a/tests/data/testGroup.xml b/tests/data/testGroup.xml index b798b4d..00260ac 100644 --- a/tests/data/testGroup.xml +++ b/tests/data/testGroup.xml @@ -1,11 +1,16 @@ + 3 - + + 1 + - + + 2 + diff --git a/tests/data/testGroupConflictingDims.xml b/tests/data/testGroupConflictingDims.xml new file mode 100644 index 0000000..c4cf6eb --- /dev/null +++ b/tests/data/testGroupConflictingDims.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/tests/data/testGroupMultiLayers.xml b/tests/data/testGroupMultiLayers.xml new file mode 100644 index 0000000..90b35eb --- /dev/null +++ b/tests/data/testGroupMultiLayers.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/tests/test_parser.py b/tests/test_parser.py index 4a1f259..a0119bf 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -360,16 +360,39 @@ def test_read_group__read_sub_group(): assert ds.get('other_group_var') is None +def test_read_group__conflicting_dims(): + """Read a group and ensure its dimension is correct""" + ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='gr_b') + assert ds.dims['index'] == 94 + assert 'index' in ds.gr_b_var.dims + + def test_flatten_groups(): """Read every group and flatten everything in a single dataset/group.""" ds = xncml.open_ncml(data / 'testGroup.xml', group='*') assert ds.toto is not None + assert ds.get('toto__1') is None assert ds.get('group_var') is not None ds.group_var.attrs['group_path'] = '/a_sub_group' assert ds.get('other_group_var') is not None ds.other_group_var.attrs['group_path'] = '/another_sub_group' +def test_flatten_groups__conflicting_dims(): + """Read every group and rename dimensions""" + ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='*') + assert 'index' in ds.gr_a_var.dims + assert ds.dims['index'] is not None + assert 'index__1' in ds.gr_b_var.dims + assert ds.dims['index__1'] is not None + + +def test_flatten_groups__sub_groups(): + """Read every group and rename dimensions""" + ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*') + assert ds.dims['index'] is not None + + # --- # def check_dimension(ds): assert len(ds['lat']) == 3 diff --git a/xncml/parser.py b/xncml/parser.py index fc3ca0e..dc3d144 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -94,7 +94,7 @@ def open_ncml(ncml: str | Path, group: str = '/') -> xr.Dataset: group : str Path of the group to parse within the ncml. The special value ``*`` opens every group and flatten the variables into a single - dataset. + dataset, renaming variables and dimensions if conflicting names are found. Returns ------- @@ -146,12 +146,12 @@ def read_netcdf( for item in filter_by_class(obj.choice, Aggregation): target = read_aggregation(target, item, ncml) - - # Handle , and elements if group == FLATTEN_GROUPS: target = flatten_groups(target, ref, obj) else: - target = read_group(target, ref, obj, group) + if not group.startswith('/'): + group = f'/{group}' + target = read_group(target, ref, obj, group_to_read=group) return target @@ -224,7 +224,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat else: raise NotImplementedError - agg = read_group(agg, None, obj, group=ROOT_GROUP) + agg = read_group(agg, ref=None, obj=obj) out = target.merge(agg, combine_attrs='no_conflicts') out.set_close(partial(_multi_file_closer, closers)) return out @@ -258,13 +258,34 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset: return xr.open_dataset(location, decode_times=False) -def flatten_groups(target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf): - """TODO doc""" - raise NotImplementedError('TODO') +def _get_leaf_groups(group: Netcdf | Group) -> list[str]: + group_children = [child for child in group.choice if isinstance(child, Group)] + if len(group_children) == 0: + return [group.name] + leafs = [] + for child in group_children: + leafs += _get_leaf_groups(child) + return leafs + + +def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset: + leaf_groups = _get_leaf_groups(root_group) + dims = {} + enums = {} + for leaf_group in leaf_groups: + # Mutate target to add leaf_group to it + read_group(target, ref, root_group, group_to_read=leaf_group, dims=dims, enums=enums) + return target def read_group( - target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf, group: str, dims: dict = None + target: xr.Dataset, + ref: xr.Dataset | None, + obj: Group | Netcdf, + group_to_read: str = '/', + parent_group_path: str = '', + dims: dict = None, + enums: dict = None, ) -> xr.Dataset: """ Parse items, typically , , and elements. @@ -273,14 +294,14 @@ def read_group( ---------- target : xr.Dataset Target dataset to be updated. - ref : xr.Dataset + ref : xr.Dataset | None Reference dataset used to copy content into `target`. obj : Group | Netcdf - object description. - group : str + object description. + parent_group : str Path of the group to parse within the ncml. - The special value ``*`` opens every group and flatten the variables into a single - dataset. + dims: dict[str, Dimension] + Dictionary of the dimensions of this dataset. Returns ------- @@ -288,19 +309,16 @@ def read_group( Dataset holding variables and attributes defined in element. """ dims = {} if dims is None else dims - enums = {} - if not group.startswith('/'): - group = f'/{group}' - prefix, _, child_path = group.partition('/') - if group == ROOT_GROUP or prefix == '': - current_group = ROOT_GROUP - else: - current_group = prefix + enums = {} if enums is None else enums for item in obj.choice: if isinstance(item, Dimension): - dims[item.name] = read_dimension(item) + dim_name = item.name + if dims.get(dim_name): + dims[dim_name].append(read_dimension(item)) + else: + dims[dim_name] = [read_dimension(item)] elif isinstance(item, Variable): - target = read_variable(target, ref, item, dims, enums, group_path=group) + target = read_variable(target, ref, item, dims, enums, group_path=parent_group_path) elif isinstance(item, Attribute): read_attribute(target, item, ref) elif isinstance(item, Remove): @@ -308,8 +326,15 @@ def read_group( elif isinstance(item, EnumTypedef): enums[item.name] = read_enum(item) elif isinstance(item, Group): - if item.name in child_path: - target = read_group(target, ref, item, current_group, dims) + if item.name in group_to_read: + target = read_group( + target, + ref, + item, + parent_group_path=f'{parent_group_path}/{item.name}', + dims=dims, + group_to_read=group_to_read, + ) else: # ignore group continue @@ -317,10 +342,19 @@ def read_group( pass # elements are parsed in `read_netcdf` else: raise AttributeError - return target +def _build_snake_name(prefix: str, suffix: str) -> str: + if prefix == ROOT_GROUP: + pre = '' + else: + pre = prefix.removeprefix('/') + pre = pre.replace('/', '_') + pre += '_' + return f'{pre}{suffix}' + + def read_scan(obj: Aggregation.Scan, ncml: Path) -> list[xr.Dataset]: """ Return list of datasets defined in element. @@ -448,7 +482,7 @@ def read_variable( dimensions: dict, enums: dict, group_path: str, -): +) -> xr.Dataset: """ Parse element. @@ -465,7 +499,7 @@ def read_variable( enums: dict[str, dict] The enums types that have been read in the upper groups. group_path: str - Full path to the parent group + Path to the parent group Returns ------- @@ -485,20 +519,32 @@ def read_variable( ref_var = None # Read existing data or create empty DataArray - if (obj.name in target) or (obj.name in target.dims): + if obj.name in target or obj.name in target.dims: out = xr.as_variable(target[obj.name]) if obj.type: out = out.astype(nctype(obj.type)) ref_var = None - elif (obj.name in ref) or (obj.name in ref.dims): + elif obj.name in ref or obj.name in ref.dims: out = xr.as_variable(ref[obj.name]) if obj.type: out = out.astype(nctype(obj.type)) ref_var = ref[obj.name] elif obj.shape: - dims = obj.shape.split(' ') - shape = [dimensions[dim].length for dim in dims] - out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=dims) + var_dims = [] + shape = [] + for dim in obj.shape.split(' '): + if dimensions.get(dim) is None: + # TODO: Add test for this branch, might be useless. + err = ( + f"Unknown dimension '{dim}'." + ' Make sure it is declared before being used in the NCML.' + ) + raise ValueError(err) + shape.append(dimensions[dim][-1].length) + if (dim_count := len(dimensions[dim])) > 1: + dim = f'{dim}__{dim_count - 1}' + var_dims.append(dim) + out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=var_dims) elif obj.shape == '': out = build_scalar_variable(var_name=obj.name, values_tag=obj.values, var_type=obj.type) else: @@ -508,10 +554,8 @@ def read_variable( # Set variable attributes for item in obj.attribute: read_attribute(out, item, ref=ref_var) - out.attrs['group_path'] = group_path - # TODO (@bzah): Maybe rename the variable using the parent full path - # if we are flattening, this would avoid overriding an existing - # variable of target. + if group_path: + out.attrs['group_path'] = group_path # Remove attributes or dimensions for item in obj.remove: @@ -537,14 +581,26 @@ def read_variable( raise NotImplementedError if obj.typedef in enums.keys(): - # TODO: Also update encoding when https://github.com/pydata/xarray/pull/8147 - # is merged in xarray. + # TODO (@bzah): Update this once Enums are merged in xarray + # https://github.com/pydata/xarray/pull/8147 out.attrs['flag_values'] = enums[obj.typedef]['flag_values'] out.attrs['flag_meanings'] = enums[obj.typedef]['flag_meanings'] elif obj.typedef is not None: raise NotImplementedError - target[obj.name] = out + var_name = obj.name + # TODO (abel): Fix for aggregation: we must override the variable and not create a new one in aggregation case + # How to differentiate this case with variables in different groups that bear the same name ? + # Should we carry a flag `override` from read_aggregation ? + # Or when a flag to know we are in sub-groups ? + if ( + (existing_var := target.get(var_name)) is not None + and existing_var.attrs.get('group_path') + and existing_var.attrs.get('group_path') != group_path + ): + similar_var_count = len([v for v in target.variables.keys() if f'{var_name}__' in v]) + var_name = f'{var_name}__{similar_var_count + 1}' + target[var_name] = out return target From 87aa891df4dc6021f52409695b42740938643608 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Tue, 16 Jan 2024 12:09:29 +0100 Subject: [PATCH 04/12] Add test for error --- tests/data/testGroupInvalidDim.xml | 7 +++++++ tests/test_parser.py | 5 +++++ xncml/parser.py | 5 ----- 3 files changed, 12 insertions(+), 5 deletions(-) create mode 100644 tests/data/testGroupInvalidDim.xml diff --git a/tests/data/testGroupInvalidDim.xml b/tests/data/testGroupInvalidDim.xml new file mode 100644 index 0000000..5ddce24 --- /dev/null +++ b/tests/data/testGroupInvalidDim.xml @@ -0,0 +1,7 @@ + + + + 3 + + + diff --git a/tests/test_parser.py b/tests/test_parser.py index a0119bf..3cb0851 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -367,6 +367,11 @@ def test_read_group__conflicting_dims(): assert 'index' in ds.gr_b_var.dims +def test_read__invalid_dim(): + with pytest.raises(ValueError, match="Unknown dimension 'myDim'.*"): + xncml.open_ncml(data / 'testGroupInvalidDim.xml') + + def test_flatten_groups(): """Read every group and flatten everything in a single dataset/group.""" ds = xncml.open_ncml(data / 'testGroup.xml', group='*') diff --git a/xncml/parser.py b/xncml/parser.py index dc3d144..2c5dd1f 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -534,7 +534,6 @@ def read_variable( shape = [] for dim in obj.shape.split(' '): if dimensions.get(dim) is None: - # TODO: Add test for this branch, might be useless. err = ( f"Unknown dimension '{dim}'." ' Make sure it is declared before being used in the NCML.' @@ -589,10 +588,6 @@ def read_variable( raise NotImplementedError var_name = obj.name - # TODO (abel): Fix for aggregation: we must override the variable and not create a new one in aggregation case - # How to differentiate this case with variables in different groups that bear the same name ? - # Should we carry a flag `override` from read_aggregation ? - # Or when a flag to know we are in sub-groups ? if ( (existing_var := target.get(var_name)) is not None and existing_var.attrs.get('group_path') From 3243a011a74c86f2f964685d9ecbbddf8fff68ea Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Tue, 16 Jan 2024 13:55:03 +0100 Subject: [PATCH 05/12] DOC: document changes --- CHANGELOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1727dad..d69d0a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +0.5.0 (unreleased) +================== + +**Breaking changes** +- Nested group handling: + Before this version, all groups were read, but conflicting variable names in-between groups would be cause a lost of data. + Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional `group` argument to specify which group should be read. + When ``group`` is missing, the root group is assumed. + Additionally ``group`` can be set to ``'*'``. In that case every group is read and the hierachy is flatten. + In the event of conflicting names, the variable/dimension name will be updated by appending '__n' where n is incremented. + + 0.4.0 (2024-01-08) ================== From 5daba4a05a0fe6ecebcb37c53876e337f7af1a17 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Tue, 16 Jan 2024 17:15:30 +0100 Subject: [PATCH 06/12] FIX: scalar parsing and nested groups --- tests/data/testGroupMultiLayers.xml | 8 +++++++- tests/test_parser.py | 6 ++++-- xncml/parser.py | 26 ++++++++++++++++++-------- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/tests/data/testGroupMultiLayers.xml b/tests/data/testGroupMultiLayers.xml index 90b35eb..789fac4 100644 --- a/tests/data/testGroupMultiLayers.xml +++ b/tests/data/testGroupMultiLayers.xml @@ -2,8 +2,14 @@ - + + + + + + + diff --git a/tests/test_parser.py b/tests/test_parser.py index 3cb0851..952343c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -314,8 +314,8 @@ def test_empty_scalar__no_values_tag(): filled, because numpy can't create an empty typed scalar. """ ds = xncml.open_ncml(data / 'testEmptyScalar.xml') - assert ds['empty_scalar_var'].dtype == np.dtype('O') - assert ds['empty_scalar_var'].item() is None + assert ds['empty_scalar_var'].dtype == np.dtype('float64') + assert ds['empty_scalar_var'].item() == 0 def test_empty_scalar__with_empty_values_tag(): @@ -396,6 +396,8 @@ def test_flatten_groups__sub_groups(): """Read every group and rename dimensions""" ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*') assert ds.dims['index'] is not None + assert ds['gr_a_var'] is not None + assert ds['gr_b_var'] is not None # --- # diff --git a/xncml/parser.py b/xncml/parser.py index 2c5dd1f..53d4eeb 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -258,18 +258,27 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset: return xr.open_dataset(location, decode_times=False) -def _get_leaf_groups(group: Netcdf | Group) -> list[str]: +def _get_netcdf_leaves(netcdf: Netcdf) -> list[str]: + parent = '' + root_children = [child for child in netcdf.choice if isinstance(child, Group)] + if len(root_children) == 0: + return ['/'] + leaves = [] + for child in root_children: + leaves += list(_get_group_leaves(child, parent=parent)) + return leaves + + +def _get_group_leaves(group: Netcdf | Group, parent: str) -> str: group_children = [child for child in group.choice if isinstance(child, Group)] if len(group_children) == 0: - return [group.name] - leafs = [] + yield f'{parent}/{group.name}' for child in group_children: - leafs += _get_leaf_groups(child) - return leafs + yield from _get_group_leaves(child, parent=f'{parent}/{group.name}') def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset: - leaf_groups = _get_leaf_groups(root_group) + leaf_groups = _get_netcdf_leaves(root_group) dims = {} enums = {} for leaf_group in leaf_groups: @@ -649,7 +658,7 @@ def read_values(var_name: str, expected_size: int, values_tag: Values) -> list: def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> xr.Variable: - """Read values for element. + """Build an xr.Variable for scalar variables. Parameters ---------- @@ -676,7 +685,8 @@ def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> x ' is empty. Provide a single values within ' ' to preserve the type.' ) - return xr.Variable(data=None, dims=()) + default_value = nctype(var_type)() + return xr.Variable(data=default_value, dims=()) values_content = read_values(var_name, expected_size=1, values_tag=values_tag) if len(values_content) == 1: return xr.Variable(data=np.array(values_content[0], dtype=nctype(var_type))[()], dims=()) From 1959e9270a8489bcd084c2d0a05f6806dcf15a87 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Tue, 16 Jan 2024 17:18:13 +0100 Subject: [PATCH 07/12] remove dead code --- xncml/parser.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/xncml/parser.py b/xncml/parser.py index 53d4eeb..ee686ae 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -354,16 +354,6 @@ def read_group( return target -def _build_snake_name(prefix: str, suffix: str) -> str: - if prefix == ROOT_GROUP: - pre = '' - else: - pre = prefix.removeprefix('/') - pre = pre.replace('/', '_') - pre += '_' - return f'{pre}{suffix}' - - def read_scan(obj: Aggregation.Scan, ncml: Path) -> list[xr.Dataset]: """ Return list of datasets defined in element. From c32d43787631b3e00cb655eaec73a32601d4e8b9 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Fri, 2 Feb 2024 10:23:09 +0100 Subject: [PATCH 08/12] Improve generator --- xncml/parser.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/xncml/parser.py b/xncml/parser.py index ee686ae..2c33b4e 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -36,6 +36,7 @@ import datetime as dt from functools import partial from pathlib import Path +from typing import TYPE_CHECKING from warnings import warn import numpy as np @@ -57,6 +58,9 @@ Variable, ) +if TYPE_CHECKING: + from collections.abc import Iterator + __author__ = 'David Huard' __date__ = 'July 2022' __contact__ = 'huard.david@ouranos.ca' @@ -258,30 +262,19 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset: return xr.open_dataset(location, decode_times=False) -def _get_netcdf_leaves(netcdf: Netcdf) -> list[str]: - parent = '' - root_children = [child for child in netcdf.choice if isinstance(child, Group)] - if len(root_children) == 0: - return ['/'] - leaves = [] - for child in root_children: - leaves += list(_get_group_leaves(child, parent=parent)) - return leaves - - -def _get_group_leaves(group: Netcdf | Group, parent: str) -> str: +def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[str]: group_children = [child for child in group.choice if isinstance(child, Group)] + current_path = '/' if parent is None else f'{parent}/{group.name}' if len(group_children) == 0: - yield f'{parent}/{group.name}' + yield current_path for child in group_children: - yield from _get_group_leaves(child, parent=f'{parent}/{group.name}') + yield from _get_leaves(child, parent=current_path) def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset: - leaf_groups = _get_netcdf_leaves(root_group) dims = {} enums = {} - for leaf_group in leaf_groups: + for leaf_group in _get_leaves(root_group): # Mutate target to add leaf_group to it read_group(target, ref, root_group, group_to_read=leaf_group, dims=dims, enums=enums) return target From ff807efa4b35b1010da88d6223143b54b7ff80c3 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Fri, 2 Feb 2024 17:13:49 +0100 Subject: [PATCH 09/12] ENH: Improve group parsing - improved performances by parsing ncml reprsentation only once - fixed issues of rewritting content when read multiple times --- tests/data/testGroupMultiLayers.xml | 7 ++- tests/test_parser.py | 9 ++-- xncml/parser.py | 71 +++++++++++++++-------------- 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/tests/data/testGroupMultiLayers.xml b/tests/data/testGroupMultiLayers.xml index 789fac4..4d41694 100644 --- a/tests/data/testGroupMultiLayers.xml +++ b/tests/data/testGroupMultiLayers.xml @@ -1,15 +1,18 @@ + + 2 + - + - + diff --git a/tests/test_parser.py b/tests/test_parser.py index 952343c..33a70bd 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -395,9 +395,12 @@ def test_flatten_groups__conflicting_dims(): def test_flatten_groups__sub_groups(): """Read every group and rename dimensions""" ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*') - assert ds.dims['index'] is not None - assert ds['gr_a_var'] is not None - assert ds['gr_b_var'] is not None + assert ds.dims['index'] == 42 + assert ds.dims['index__1'] == 22 + assert ds['a_var'].size == 1 + assert ds['a_var'] == 2 + assert ds['a_var__1'].size == 42 + assert ds['a_var__2'].size == 22 # --- # diff --git a/xncml/parser.py b/xncml/parser.py index 2c33b4e..1ce4b3c 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -155,7 +155,7 @@ def read_netcdf( else: if not group.startswith('/'): group = f'/{group}' - target = read_group(target, ref, obj, group_to_read=group) + target = read_group(target, ref, obj, groups_to_read=[group]) return target @@ -228,7 +228,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat else: raise NotImplementedError - agg = read_group(agg, ref=None, obj=obj) + agg = read_group(agg, ref=None, obj=obj, groups_to_read=['/']) out = target.merge(agg, combine_attrs='no_conflicts') out.set_close(partial(_multi_file_closer, closers)) return out @@ -264,7 +264,7 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset: def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[str]: group_children = [child for child in group.choice if isinstance(child, Group)] - current_path = '/' if parent is None else f'{parent}/{group.name}' + current_path = '' if parent is None else f'{parent}/{group.name}' if len(group_children) == 0: yield current_path for child in group_children: @@ -274,9 +274,8 @@ def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[st def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset: dims = {} enums = {} - for leaf_group in _get_leaves(root_group): - # Mutate target to add leaf_group to it - read_group(target, ref, root_group, group_to_read=leaf_group, dims=dims, enums=enums) + leaves_group = list(_get_leaves(root_group)) + read_group(target, ref, root_group, groups_to_read=leaves_group, dims=dims, enums=enums) return target @@ -284,8 +283,8 @@ def read_group( target: xr.Dataset, ref: xr.Dataset | None, obj: Group | Netcdf, - group_to_read: str = '/', - parent_group_path: str = '', + groups_to_read: list[str], + parent_group_path: str = '/', dims: dict = None, enums: dict = None, ) -> xr.Dataset: @@ -328,14 +327,14 @@ def read_group( elif isinstance(item, EnumTypedef): enums[item.name] = read_enum(item) elif isinstance(item, Group): - if item.name in group_to_read: + if any(item.name in group_name for group_name in groups_to_read): target = read_group( target, ref, item, - parent_group_path=f'{parent_group_path}/{item.name}', + parent_group_path=f'{parent_group_path}{item.name}/', dims=dims, - group_to_read=group_to_read, + groups_to_read=groups_to_read, ) else: # ignore group @@ -510,17 +509,22 @@ def read_variable( else: ref_var = None + var_name = obj.name # Read existing data or create empty DataArray - if obj.name in target or obj.name in target.dims: - out = xr.as_variable(target[obj.name]) + if (existing_var := target.get(var_name)) is not None and existing_var.attrs.get( + 'group_path' + ) in [None, group_path]: + out = xr.as_variable(target[var_name]) if obj.type: out = out.astype(nctype(obj.type)) ref_var = None - elif obj.name in ref or obj.name in ref.dims: - out = xr.as_variable(ref[obj.name]) + elif (existing_var := ref.get(var_name)) is not None and existing_var.attrs.get( + 'group_path' + ) in [None, group_path]: + out = xr.as_variable(ref[var_name]) if obj.type: out = out.astype(nctype(obj.type)) - ref_var = ref[obj.name] + ref_var = ref[var_name] elif obj.shape: var_dims = [] shape = [] @@ -537,16 +541,15 @@ def read_variable( var_dims.append(dim) out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=var_dims) elif obj.shape == '': - out = build_scalar_variable(var_name=obj.name, values_tag=obj.values, var_type=obj.type) + out = build_scalar_variable(var_name=var_name, values_tag=obj.values, var_type=obj.type) else: - error_msg = f'Could not build variable `{obj.name}`.' + error_msg = f'Could not build variable `{var_name }`.' raise ValueError(error_msg) # Set variable attributes for item in obj.attribute: read_attribute(out, item, ref=ref_var) - if group_path: - out.attrs['group_path'] = group_path + out.attrs['group_path'] = group_path # Remove attributes or dimensions for item in obj.remove: @@ -554,7 +557,7 @@ def read_variable( # Read values for arrays (already done for a scalar) if obj.values and obj.shape != '': - data = read_values(obj.name, out.size, obj.values) + data = read_values(var_name, out.size, obj.values) data = out.dtype.type(data) out = xr.Variable( out.dims, @@ -578,15 +581,16 @@ def read_variable( out.attrs['flag_meanings'] = enums[obj.typedef]['flag_meanings'] elif obj.typedef is not None: raise NotImplementedError + import re - var_name = obj.name - if ( - (existing_var := target.get(var_name)) is not None - and existing_var.attrs.get('group_path') - and existing_var.attrs.get('group_path') != group_path - ): - similar_var_count = len([v for v in target.variables.keys() if f'{var_name}__' in v]) - var_name = f'{var_name}__{similar_var_count + 1}' + reg = re.compile(f'^{var_name}__|{var_name}') + similar_vars_but_diff_path = [ + v + for v in target.data_vars + if reg.match(v) and target[v].attrs.get('group_path') not in [None, group_path] + ] + if len(similar_vars_but_diff_path) > 0: + var_name = f'{var_name}__{len(similar_vars_but_diff_path)}' target[var_name] = out return target @@ -611,7 +615,8 @@ def read_values(var_name: str, expected_size: int, values_tag: Values) -> list: if values_tag.from_attribute is not None: error_msg = ( 'xncml cannot yet fetch values from a global or a ' - f' variable attribute using , here on variable {var_name}.' + ' variable attribute using , here on variable' + f' {var_name}.' ) raise NotImplementedError(error_msg) if values_tag.start is not None and values_tag.increment is not None: @@ -663,12 +668,12 @@ def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> x If the tag is not a valid scalar. """ if values_tag is None: + default_value = nctype(var_type)() warn( - f'Could not set the type for the scalar variable {var_name}, as its' - ' is empty. Provide a single values within ' + f'The scalar variable {var_name} has no values set within' + f' . A default value of {default_value} is set' ' to preserve the type.' ) - default_value = nctype(var_type)() return xr.Variable(data=default_value, dims=()) values_content = read_values(var_name, expected_size=1, values_tag=values_tag) if len(values_content) == 1: From 18a7c8b2b54160d327fee870f35f2450c160c8a1 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Mon, 5 Feb 2024 13:45:11 +0100 Subject: [PATCH 10/12] Clean --- tests/test_parser.py | 8 ++++---- xncml/parser.py | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 33a70bd..9676daa 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -310,8 +310,8 @@ def test_unsigned_type(): def test_empty_scalar__no_values_tag(): """ - A scalar without a tag will not be typed, even if the 'type' attribute is - filled, because numpy can't create an empty typed scalar. + A scalar variable which is missing will have its value set to + the default value of its type. """ ds = xncml.open_ncml(data / 'testEmptyScalar.xml') assert ds['empty_scalar_var'].dtype == np.dtype('float64') @@ -319,13 +319,13 @@ def test_empty_scalar__no_values_tag(): def test_empty_scalar__with_empty_values_tag(): - """A scalar with an empty in its tag is invalid.""" + """A scalar with an empty tag is invalid.""" with pytest.raises(ValueError, match='No values found for variable .*'): xncml.open_ncml(data / 'testEmptyScalar_withValuesTag.xml') def test_multiple_values_for_scalar(): - """a scalar with multiple values in its tag is invalid.""" + """A scalar with multiple values in its tag is invalid.""" with pytest.raises(ValueError, match='The expected size for variable .* was 1, .*'): xncml.open_ncml(data / 'testEmptyScalar_withMultipleValues.xml') diff --git a/xncml/parser.py b/xncml/parser.py index b7649ee..8b20cd6 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -86,7 +86,7 @@ def parse(path: Path) -> Netcdf: return parser.from_path(path, Netcdf) -def open_ncml(ncml: str | Path, group: str = '/') -> xr.Dataset: +def open_ncml(ncml: str | Path, group: str = ROOT_GROUP) -> xr.Dataset: """ Convert NcML document to a dataset. @@ -227,7 +227,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat else: raise NotImplementedError - agg = read_group(agg, ref=None, obj=obj, groups_to_read=['/']) + agg = read_group(agg, ref=None, obj=obj, groups_to_read=[ROOT_GROUP]) out = target.merge(agg, combine_attrs='no_conflicts') out.set_close(partial(_multi_file_closer, closers)) return out @@ -283,7 +283,7 @@ def read_group( ref: xr.Dataset | None, obj: Group | Netcdf, groups_to_read: list[str], - parent_group_path: str = '/', + parent_group_path: str = ROOT_GROUP, dims: dict = None, enums: dict = None, ) -> xr.Dataset: @@ -311,6 +311,7 @@ def read_group( dims = {} if dims is None else dims enums = {} if enums is None else enums for item in obj.choice: + if isinstance(item, Dimension): dim_name = item.name if dims.get(dim_name): From ae842fe9378da8985f613fca7abd21c500d0ee52 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Mon, 5 Feb 2024 18:10:57 +0100 Subject: [PATCH 11/12] Apply suggestions from code review Co-authored-by: David Huard --- CHANGELOG.md | 6 +----- xncml/parser.py | 10 +++++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d69d0a6..68f0d9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,7 @@ **Breaking changes** - Nested group handling: - Before this version, all groups were read, but conflicting variable names in-between groups would be cause a lost of data. - Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional `group` argument to specify which group should be read. - When ``group`` is missing, the root group is assumed. - Additionally ``group`` can be set to ``'*'``. In that case every group is read and the hierachy is flatten. - In the event of conflicting names, the variable/dimension name will be updated by appending '__n' where n is incremented. + Before this version, all groups were read, but conflicting variable names in-between groups would shadow data. Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional ``group`` argument to specify which group should be read. When ``group`` is not specified, it defaults to the root group. Additionally ``group`` can be set to ``'*'`` so that every group is read and the hierarchy is flattened. In the event of conflicting variable/dimension names across groups, the conflicting name will be modified by appending ``'__n'`` where n is incremented. 0.4.0 (2024-01-08) diff --git a/xncml/parser.py b/xncml/parser.py index 8b20cd6..320b3de 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -60,7 +60,7 @@ if TYPE_CHECKING: from collections.abc import Iterator -__author__ = 'David Huard' +__author__ = 'David Huard, Abel Aoun' __date__ = 'July 2022' __contact__ = 'huard.david@ouranos.ca' @@ -96,7 +96,7 @@ def open_ncml(ncml: str | Path, group: str = ROOT_GROUP) -> xr.Dataset: Path to NcML file. group : str Path of the group to parse within the ncml. - The special value ``*`` opens every group and flatten the variables into a single + The special value ``*`` opens every group and flattens the variables into a single dataset, renaming variables and dimensions if conflicting names are found. Returns @@ -129,7 +129,7 @@ def read_netcdf( Path to NcML document, sometimes required to follow relative links. group : str Path of the group to parse within the ncml. - The special value ``*`` opens every group and flatten the variables into a single + The special value ``*`` opens every group and flattens the variables into a single dataset. Returns @@ -488,9 +488,9 @@ def read_variable( dimensions : dict Dimension attributes keyed by name. enums: dict[str, dict] - The enums types that have been read in the upper groups. + The enums types that have been read in the parent groups. group_path: str - Path to the parent group + Path to the parent group. Returns ------- From dac587080c70712b066b34c5482fbe4f8ac5ad25 Mon Sep 17 00:00:00 2001 From: Abel Aoun Date: Mon, 5 Feb 2024 18:18:55 +0100 Subject: [PATCH 12/12] ENH: Improve parser following code review --- xncml/parser.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/xncml/parser.py b/xncml/parser.py index 320b3de..a71f249 100644 --- a/xncml/parser.py +++ b/xncml/parser.py @@ -150,7 +150,7 @@ def read_netcdf( for item in filter_by_class(obj.choice, Aggregation): target = read_aggregation(target, item, ncml) if group == FLATTEN_GROUPS: - target = flatten_groups(target, ref, obj) + target = _flatten_groups(target, ref, obj) else: if not group.startswith('/'): group = f'/{group}' @@ -263,14 +263,14 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset: def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[str]: group_children = [child for child in group.choice if isinstance(child, Group)] - current_path = '' if parent is None else f'{parent}/{group.name}' + current_path = ROOT_GROUP if parent is None else f'{parent}{group.name}/' if len(group_children) == 0: yield current_path for child in group_children: yield from _get_leaves(child, parent=current_path) -def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset: +def _flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset: dims = {} enums = {} leaves_group = list(_get_leaves(root_group)) @@ -298,8 +298,10 @@ def read_group( Reference dataset used to copy content into `target`. obj : Group | Netcdf object description. - parent_group : str - Path of the group to parse within the ncml. + groups_to_read : list[str] + List of groups that must be read and included in `target`. + parent_group_path : str + Path of parent group, by default the root group '/'. dims: dict[str, Dimension] Dictionary of the dimensions of this dataset. @@ -311,7 +313,6 @@ def read_group( dims = {} if dims is None else dims enums = {} if enums is None else enums for item in obj.choice: - if isinstance(item, Dimension): dim_name = item.name if dims.get(dim_name):