Skip to content

Commit

Permalink
Enh/manage groups (#66)
Browse files Browse the repository at this point in the history
* ENH: Add group parameter to open_ncml

- Modify the default behavior to read only the root group.
- Make it possible to read a specific group with `group="path/to/group"`.
- Update the flattening capabilities to read nested groups
When open_ncml is called with group="*", every group will be read
and they will be flattened in the resulting dataset.
If names conflict, the dimension and variable names are
suffixed with `__n`, where n is the number of existing similar names.

Also update the parsing of scalars whose <values> tag in the ncml is empty:
the scalar type is now preserved and a default value is assigned to it.

---------

Co-authored-by: Abel Aoun <[email protected]>
Co-authored-by: David Huard <[email protected]>
  • Loading branch information
3 people authored Feb 6, 2024
1 parent c5c5972 commit dc7ab52
Show file tree
Hide file tree
Showing 7 changed files with 263 additions and 46 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
0.5.0 (unreleased)
==================

**Breaking changes**
- Nested group handling:
Before this version, all groups were read, but conflicting variable names across groups would shadow data. Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional ``group`` argument to specify which group should be read. When ``group`` is not specified, it defaults to the root group. Additionally, ``group`` can be set to ``'*'`` so that every group is read and the hierarchy is flattened. In the event of conflicting variable/dimension names across groups, the conflicting name is modified by appending ``'__n'``, where n is incremented.


0.4.0 (2024-01-08)
==================

Expand Down
16 changes: 16 additions & 0 deletions tests/data/testGroup.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
<variable name="toto" shape="" type="ushort">
<values>3</values>
</variable>
<group name="a_sub_group">
<variable name="group_var" shape="" type="ushort">
<values>1</values>
</variable>
</group>
<group name="another_sub_group">
<variable name="other_group_var" shape="" type="ushort">
<values>2</values>
</variable>
</group>
</netcdf>
11 changes: 11 additions & 0 deletions tests/data/testGroupConflictingDims.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
<group name="gr_a">
<dimension name="index" length="42"/>
<variable name="gr_a_var" shape="index" type="ushort"></variable>
</group>
<group name="gr_b">
<dimension name="index" length="94"/>
<variable name="gr_b_var" shape="index" type="ushort"></variable>
</group>
</netcdf>
7 changes: 7 additions & 0 deletions tests/data/testGroupInvalidDim.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
<variable name="toto" shape="myDim" type="ushort">
<values>3</values>
</variable>
<dimension name="myDim"></dimension>
</netcdf>
18 changes: 18 additions & 0 deletions tests/data/testGroupMultiLayers.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?xml version="1.0" encoding="UTF-8"?>
<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
<variable name="a_var" shape="" type="ushort">
<values>2</values>
</variable>
<group name="gr_a">
<dimension name="index" length="42"/>
<group name="sub_gr">
<variable name="a_var" shape="index" type="ushort"></variable>
</group>
</group>
<group name="gr_b">
<dimension name="index" length="22"/>
<group name="sub_gr">
<variable name="a_var" shape="index" type="ushort"></variable>
</group>
</group>
</netcdf>
72 changes: 66 additions & 6 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,22 +310,22 @@ def test_unsigned_type():

def test_empty_scalar__no_values_tag():
"""
Scalar without values loose their type because we can't create a typed numpy
scalar which is empty
A scalar variable which <values> is missing will have its value set to
the default value of its type.
"""
ds = xncml.open_ncml(data / 'testEmptyScalar.xml')
assert ds['empty_scalar_var'].dtype == np.dtype('O')
assert ds['empty_scalar_var'].item() is None
assert ds['empty_scalar_var'].dtype == np.dtype('float64')
assert ds['empty_scalar_var'].item() == 0


def test_empty_scalar__with_empty_values_tag():
"""A scalar variable with an empty <values> tag is invalid."""
"""A scalar with an empty <values> tag is invalid."""
with pytest.raises(ValueError, match='No values found for variable .*'):
xncml.open_ncml(data / 'testEmptyScalar_withValuesTag.xml')


def test_multiple_values_for_scalar():
"""Scalar with an multiple values in <values> tag is invalid."""
"""A scalar with multiple values in its <values> tag is invalid."""
with pytest.raises(ValueError, match='The expected size for variable .* was 1, .*'):
xncml.open_ncml(data / 'testEmptyScalar_withMultipleValues.xml')

Expand All @@ -343,6 +343,66 @@ def test_empty_attr():
assert ds.attrs['comment'] == ''


def test_read_group__read_only_root_group():
    """Without a ``group`` argument, only root-group variables are loaded."""
    dataset = xncml.open_ncml(data / 'testGroup.xml')
    assert dataset.toto is not None
    # Variables declared inside the sub-groups must not appear in the result.
    for nested_name in ('group_var', 'other_group_var'):
        assert dataset.get(nested_name) is None


def test_read_group__read_sub_group():
    """Reading one sub-group loads it and the root, but not its siblings."""
    dataset = xncml.open_ncml(data / 'testGroup.xml', group='a_sub_group')
    assert dataset.toto is not None
    assert dataset.get('group_var') is not None
    # NOTE(review): this statement *assigns* the attribute instead of checking
    # it, so it verifies nothing. It was presumably meant to be
    # `assert ... == '/a_sub_group'` -- confirm that the parser actually sets
    # `group_path` before turning it into an assertion.
    dataset.group_var.attrs['group_path'] = '/a_sub_group'
    # The sibling group's variable must be absent.
    assert dataset.get('other_group_var') is None


def test_read_group__conflicting_dims():
    """Read a single group and check that it uses its own ``index`` dimension.

    Both ``gr_a`` and ``gr_b`` declare a dimension named ``index`` (lengths 42
    and 94 respectively). Reading only ``gr_b`` must expose the length-94
    dimension under its unmodified name.
    """
    ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='gr_b')
    # Use ``Dataset.sizes`` for the name -> length lookup: indexing
    # ``Dataset.dims`` as a mapping is deprecated in xarray.
    assert ds.sizes['index'] == 94
    assert 'index' in ds.gr_b_var.dims


def test_read__invalid_dim():
    """Opening ``testGroupInvalidDim.xml`` raises, reporting ``myDim`` as unknown."""
    expected_message = "Unknown dimension 'myDim'.*"
    ncml_path = data / 'testGroupInvalidDim.xml'
    with pytest.raises(ValueError, match=expected_message):
        xncml.open_ncml(ncml_path)


def test_flatten_groups():
    """With ``group='*'``, variables from every group land in one dataset."""
    dataset = xncml.open_ncml(data / 'testGroup.xml', group='*')
    assert dataset.toto is not None
    # `toto` exists only once in the file, so no suffixed duplicate is expected.
    assert dataset.get('toto__1') is None
    assert dataset.get('group_var') is not None
    # NOTE(review): the two `group_path` statements below *assign* the
    # attribute instead of checking it, so they verify nothing. They were
    # presumably meant to be `assert ... == ...` -- confirm that the parser
    # sets `group_path` before turning them into assertions.
    dataset.group_var.attrs['group_path'] = '/a_sub_group'
    assert dataset.get('other_group_var') is not None
    dataset.other_group_var.attrs['group_path'] = '/another_sub_group'


def test_flatten_groups__conflicting_dims():
    """Flatten every group and check that the clashing dimension is renamed.

    ``gr_a`` and ``gr_b`` both declare ``index`` (lengths 42 and 94). After
    flattening, the first keeps its name and the second becomes ``index__1``.
    """
    ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='*')
    assert 'index' in ds.gr_a_var.dims
    # Check the actual lengths through ``Dataset.sizes``. The former
    # ``ds.dims[...] is not None`` check could never fail for an existing
    # dimension, and mapping access on ``Dataset.dims`` is deprecated in xarray.
    assert ds.sizes['index'] == 42
    assert 'index__1' in ds.gr_b_var.dims
    assert ds.sizes['index__1'] == 94


def test_flatten_groups__sub_groups():
    """Flatten nested groups and check that clashing names get ``__n`` suffixes.

    The fixture has a scalar ``a_var`` at the root and an ``a_var`` in
    ``gr_a/sub_gr`` and ``gr_b/sub_gr``, each shaped by its parent group's
    ``index`` dimension (lengths 42 and 22).
    """
    ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*')
    # Use ``Dataset.sizes`` for the name -> length lookup: indexing
    # ``Dataset.dims`` as a mapping is deprecated in xarray.
    assert ds.sizes['index'] == 42
    assert ds.sizes['index__1'] == 22
    # The root scalar keeps its name; the nested variables are suffixed.
    assert ds['a_var'].size == 1
    assert ds['a_var'] == 2
    assert ds['a_var__1'].size == 42
    assert ds['a_var__2'].size == 22


# --- #
def check_dimension(ds):
assert len(ds['lat']) == 3
Expand Down
Loading

0 comments on commit dc7ab52

Please sign in to comment.