From d5df9153351f3f9b09499532666023a14073eb49 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Thu, 11 Jan 2024 20:01:12 +0100
Subject: [PATCH 01/12] FIX: pre-commit hook for black

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d2cbbc2..0a36b3e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,7 +13,7 @@ repos:
       -   id: check-yaml
       -   id: double-quote-string-fixer
 
-  -   repo: https://github.com/psf/black
+  -   repo: https://github.com/psf/black-pre-commit-mirror
       rev: 23.12.1
       hooks:
       - id: black

From be6acbc66e1454e301312fef9ed5551251555caa Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Fri, 12 Jan 2024 14:35:13 +0100
Subject: [PATCH 02/12] ENH: Add group filtering to open_ncml

---
 tests/data/testGroup.xml | 11 +++++++
 tests/test_parser.py     | 35 +++++++++++++++++---
 xncml/parser.py          | 70 +++++++++++++++++++++++++++++++++-------
 3 files changed, 100 insertions(+), 16 deletions(-)
 create mode 100644 tests/data/testGroup.xml

diff --git a/tests/data/testGroup.xml b/tests/data/testGroup.xml
new file mode 100644
index 0000000..b798b4d
--- /dev/null
+++ b/tests/data/testGroup.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
+    <variable name="toto" shape="" type="ushort">
+    </variable>
+    <group name="a_sub_group">
+        <variable name="group_var" shape="" type="ushort"></variable>
+    </group>
+    <group name="another_sub_group">
+        <variable name="other_group_var" shape="" type="ushort"></variable>
+    </group>
+</netcdf>
diff --git a/tests/test_parser.py b/tests/test_parser.py
index ddab016..4a1f259 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -310,8 +310,8 @@ def test_unsigned_type():
 
 def test_empty_scalar__no_values_tag():
     """
-    Scalar without values loose their type because we can't create a typed numpy
-    scalar which is empty
+    A scalar without a <values> tag will not be typed, even if the 'type' attribute is
+    filled, because numpy can't create an empty typed scalar.
     """
     ds = xncml.open_ncml(data / 'testEmptyScalar.xml')
     assert ds['empty_scalar_var'].dtype == np.dtype('O')
@@ -319,13 +319,13 @@ def test_empty_scalar__no_values_tag():
 
 
 def test_empty_scalar__with_empty_values_tag():
-    """A scalar variable with an empty <values> tag is invalid."""
+    """A scalar with an empty in its <values> tag is invalid."""
     with pytest.raises(ValueError, match='No values found for variable .*'):
         xncml.open_ncml(data / 'testEmptyScalar_withValuesTag.xml')
 
 
 def test_multiple_values_for_scalar():
-    """Scalar with an multiple values in <values> tag is invalid."""
+    """a scalar with multiple values in its <values> tag is invalid."""
     with pytest.raises(ValueError, match='The expected size for variable .* was 1, .*'):
         xncml.open_ncml(data / 'testEmptyScalar_withMultipleValues.xml')
 
@@ -343,6 +343,33 @@ def test_empty_attr():
     assert ds.attrs['comment'] == ''
 
 
+def test_read_group__read_only_root_group():
+    """By default, only read root group."""
+    ds = xncml.open_ncml(data / 'testGroup.xml')
+    assert ds.toto is not None
+    assert ds.get('group_var') is None
+    assert ds.get('other_group_var') is None
+
+
+def test_read_group__read_sub_group():
+    """Read specified sub group and its parents."""
+    ds = xncml.open_ncml(data / 'testGroup.xml', group='a_sub_group')
+    assert ds.toto is not None
+    assert ds.get('group_var') is not None
+    ds.group_var.attrs['group_path'] = '/a_sub_group'
+    assert ds.get('other_group_var') is None
+
+
+def test_flatten_groups():
+    """Read every group and flatten everything in a single dataset/group."""
+    ds = xncml.open_ncml(data / 'testGroup.xml', group='*')
+    assert ds.toto is not None
+    assert ds.get('group_var') is not None
+    ds.group_var.attrs['group_path'] = '/a_sub_group'
+    assert ds.get('other_group_var') is not None
+    ds.other_group_var.attrs['group_path'] = '/another_sub_group'
+
+
 # --- #
 def check_dimension(ds):
     assert len(ds['lat']) == 3
diff --git a/xncml/parser.py b/xncml/parser.py
index ef80640..fc3ca0e 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -22,7 +22,6 @@
 - <logicalSection>
 - <logicalSlice>
 - <promoteGlobalAttribute>
-- <enumTypedef>
 - <scanFmrc>
 
 Support for these attributes is missing:
@@ -62,6 +61,9 @@
 __date__ = 'July 2022'
 __contact__ = 'huard.david@ouranos.ca'
 
+FLATTEN_GROUPS = '*'
+ROOT_GROUP = '/'
+
 
 def parse(path: Path) -> Netcdf:
     """
@@ -81,7 +83,7 @@ def parse(path: Path) -> Netcdf:
     return parser.from_path(path, Netcdf)
 
 
-def open_ncml(ncml: str | Path) -> xr.Dataset:
+def open_ncml(ncml: str | Path, group: str = '/') -> xr.Dataset:
     """
     Convert NcML document to a dataset.
 
@@ -89,6 +91,10 @@ def open_ncml(ncml: str | Path) -> xr.Dataset:
     ----------
     ncml : str | Path
       Path to NcML file.
+    group : str
+      Path of the group to parse within the ncml.
+      The special value ``*`` opens every group and flatten the variables into a single
+      dataset.
 
     Returns
     -------
@@ -99,10 +105,12 @@ def open_ncml(ncml: str | Path) -> xr.Dataset:
     ncml = Path(ncml)
     obj = parse(ncml)
 
-    return read_netcdf(xr.Dataset(), xr.Dataset(), obj, ncml)
+    return read_netcdf(xr.Dataset(), xr.Dataset(), obj, ncml, group)
 
 
-def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) -> xr.Dataset:
+def read_netcdf(
+    target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path, group: str
+) -> xr.Dataset:
     """
     Return content of <netcdf> element.
 
@@ -116,6 +124,10 @@ def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) ->
        <netcdf> object description.
     ncml : Path
       Path to NcML document, sometimes required to follow relative links.
+    group : str
+      Path of the group to parse within the ncml.
+      The special value ``*`` opens every group and flatten the variables into a single
+      dataset.
 
     Returns
     -------
@@ -136,8 +148,10 @@ def read_netcdf(target: xr.Dataset, ref: xr.Dataset, obj: Netcdf, ncml: Path) ->
         target = read_aggregation(target, item, ncml)
 
     # Handle <variable>, <attribute> and <remove> elements
-    target = read_group(target, ref, obj)
-
+    if group == FLATTEN_GROUPS:
+        target = flatten_groups(target, ref, obj)
+    else:
+        target = read_group(target, ref, obj, group)
     return target
 
 
@@ -173,7 +187,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat
 
     for item in obj.netcdf:
         # Open dataset defined in <netcdf>'s `location` attribute
-        tar = read_netcdf(xr.Dataset(), ref=xr.Dataset(), obj=item, ncml=ncml)
+        tar = read_netcdf(xr.Dataset(), ref=xr.Dataset(), obj=item, ncml=ncml, group=ROOT_GROUP)
         closers.append(getattr(tar, '_close'))
 
         # Select variables
@@ -210,7 +224,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat
     else:
         raise NotImplementedError
 
-    agg = read_group(agg, None, obj)
+    agg = read_group(agg, None, obj, group=ROOT_GROUP)
     out = target.merge(agg, combine_attrs='no_conflicts')
     out.set_close(partial(_multi_file_closer, closers))
     return out
@@ -244,8 +258,13 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset:
         return xr.open_dataset(location, decode_times=False)
 
 
+def flatten_groups(target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf):
+    """TODO doc"""
+    raise NotImplementedError('TODO')
+
+
 def read_group(
-    target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf, dims: dict = None
+    target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf, group: str, dims: dict = None
 ) -> xr.Dataset:
     """
     Parse <group> items, typically <dimension>, <variable>, <attribute> and <remove> elements.
@@ -258,6 +277,10 @@ def read_group(
       Reference dataset used to copy content into `target`.
     obj : Group | Netcdf
        <netcdf> object description.
+    group : str
+      Path of the group to parse within the ncml.
+      The special value ``*`` opens every group and flatten the variables into a single
+      dataset.
 
     Returns
     -------
@@ -266,11 +289,18 @@ def read_group(
     """
     dims = {} if dims is None else dims
     enums = {}
+    if not group.startswith('/'):
+        group = f'/{group}'
+    prefix, _, child_path = group.partition('/')
+    if group == ROOT_GROUP or prefix == '':
+        current_group = ROOT_GROUP
+    else:
+        current_group = prefix
     for item in obj.choice:
         if isinstance(item, Dimension):
             dims[item.name] = read_dimension(item)
         elif isinstance(item, Variable):
-            target = read_variable(target, ref, item, dims, enums)
+            target = read_variable(target, ref, item, dims, enums, group_path=group)
         elif isinstance(item, Attribute):
             read_attribute(target, item, ref)
         elif isinstance(item, Remove):
@@ -278,7 +308,11 @@ def read_group(
         elif isinstance(item, EnumTypedef):
             enums[item.name] = read_enum(item)
         elif isinstance(item, Group):
-            target = read_group(target, ref, item, dims)
+            if item.name in child_path:
+                target = read_group(target, ref, item, current_group, dims)
+            else:
+                # ignore group
+                continue
         elif isinstance(item, Aggregation):
             pass  # <aggregation> elements are parsed in `read_netcdf`
         else:
@@ -408,7 +442,12 @@ def read_enum(obj: EnumTypedef) -> dict[str, list]:
 
 
 def read_variable(
-    target: xr.Dataset, ref: xr.Dataset, obj: Variable, dimensions: dict, enums: dict
+    target: xr.Dataset,
+    ref: xr.Dataset,
+    obj: Variable,
+    dimensions: dict,
+    enums: dict,
+    group_path: str,
 ):
     """
     Parse <variable> element.
@@ -424,6 +463,9 @@ def read_variable(
     dimensions : dict
       Dimension attributes keyed by name.
     enums: dict[str, dict]
+      The enums types that have been read in the upper groups.
+    group_path: str
+      Full path to the parent group
 
     Returns
     -------
@@ -466,6 +508,10 @@ def read_variable(
     # Set variable attributes
     for item in obj.attribute:
         read_attribute(out, item, ref=ref_var)
+    out.attrs['group_path'] = group_path
+    # TODO (@bzah): Maybe rename the variable using the parent full path
+    #       if we are flattening, this would avoid overriding an existing
+    #       variable of target.
 
     # Remove attributes or dimensions
     for item in obj.remove:

From d96544322951cc6ba407abb019e46f5a63e9e633 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Fri, 12 Jan 2024 20:08:13 +0100
Subject: [PATCH 03/12] ENH: Add flatten capabilities to read nested groups

When open_ncml is called with group="*", every group will be read
and they will be flatten in the resulting dataset.
If names are conflicting, the dimensions and varaibles names are
appended with __n where n is the number of existing simlar names.
---
 tests/data/testGroup.xml                |   9 +-
 tests/data/testGroupConflictingDims.xml |  11 ++
 tests/data/testGroupMultiLayers.xml     |   9 ++
 tests/test_parser.py                    |  23 ++++
 xncml/parser.py                         | 138 +++++++++++++++++-------
 5 files changed, 147 insertions(+), 43 deletions(-)
 create mode 100644 tests/data/testGroupConflictingDims.xml
 create mode 100644 tests/data/testGroupMultiLayers.xml

diff --git a/tests/data/testGroup.xml b/tests/data/testGroup.xml
index b798b4d..00260ac 100644
--- a/tests/data/testGroup.xml
+++ b/tests/data/testGroup.xml
@@ -1,11 +1,16 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
     <variable name="toto" shape="" type="ushort">
+        <values>3</values>
     </variable>
     <group name="a_sub_group">
-        <variable name="group_var" shape="" type="ushort"></variable>
+        <variable name="group_var" shape="" type="ushort">
+            <values>1</values>
+        </variable>
     </group>
     <group name="another_sub_group">
-        <variable name="other_group_var" shape="" type="ushort"></variable>
+        <variable name="other_group_var" shape="" type="ushort">
+            <values>2</values>
+        </variable>
     </group>
 </netcdf>
diff --git a/tests/data/testGroupConflictingDims.xml b/tests/data/testGroupConflictingDims.xml
new file mode 100644
index 0000000..c4cf6eb
--- /dev/null
+++ b/tests/data/testGroupConflictingDims.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
+    <group name="gr_a">
+        <dimension name="index" length="42"/>
+        <variable name="gr_a_var" shape="index" type="ushort"></variable>
+    </group>
+    <group name="gr_b">
+        <dimension name="index" length="94"/>
+        <variable name="gr_b_var" shape="index" type="ushort"></variable>
+    </group>
+</netcdf>
diff --git a/tests/data/testGroupMultiLayers.xml b/tests/data/testGroupMultiLayers.xml
new file mode 100644
index 0000000..90b35eb
--- /dev/null
+++ b/tests/data/testGroupMultiLayers.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
+    <group name="gr_a">
+        <dimension name="index" length="42"/>
+        <group name="sub_gr_a">
+            <variable name="gr_a_var" shape="index" type="ushort"></variable>
+        </group>
+    </group>
+</netcdf>
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 4a1f259..a0119bf 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -360,16 +360,39 @@ def test_read_group__read_sub_group():
     assert ds.get('other_group_var') is None
 
 
+def test_read_group__conflicting_dims():
+    """Read a group and ensure its dimension is correct"""
+    ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='gr_b')
+    assert ds.dims['index'] == 94
+    assert 'index' in ds.gr_b_var.dims
+
+
 def test_flatten_groups():
     """Read every group and flatten everything in a single dataset/group."""
     ds = xncml.open_ncml(data / 'testGroup.xml', group='*')
     assert ds.toto is not None
+    assert ds.get('toto__1') is None
     assert ds.get('group_var') is not None
     ds.group_var.attrs['group_path'] = '/a_sub_group'
     assert ds.get('other_group_var') is not None
     ds.other_group_var.attrs['group_path'] = '/another_sub_group'
 
 
+def test_flatten_groups__conflicting_dims():
+    """Read every group and rename dimensions"""
+    ds = xncml.open_ncml(data / 'testGroupConflictingDims.xml', group='*')
+    assert 'index' in ds.gr_a_var.dims
+    assert ds.dims['index'] is not None
+    assert 'index__1' in ds.gr_b_var.dims
+    assert ds.dims['index__1'] is not None
+
+
+def test_flatten_groups__sub_groups():
+    """Read every group and rename dimensions"""
+    ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*')
+    assert ds.dims['index'] is not None
+
+
 # --- #
 def check_dimension(ds):
     assert len(ds['lat']) == 3
diff --git a/xncml/parser.py b/xncml/parser.py
index fc3ca0e..dc3d144 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -94,7 +94,7 @@ def open_ncml(ncml: str | Path, group: str = '/') -> xr.Dataset:
     group : str
       Path of the group to parse within the ncml.
       The special value ``*`` opens every group and flatten the variables into a single
-      dataset.
+      dataset, renaming variables and dimensions if conflicting names are found.
 
     Returns
     -------
@@ -146,12 +146,12 @@ def read_netcdf(
 
     for item in filter_by_class(obj.choice, Aggregation):
         target = read_aggregation(target, item, ncml)
-
-    # Handle <variable>, <attribute> and <remove> elements
     if group == FLATTEN_GROUPS:
         target = flatten_groups(target, ref, obj)
     else:
-        target = read_group(target, ref, obj, group)
+        if not group.startswith('/'):
+            group = f'/{group}'
+        target = read_group(target, ref, obj, group_to_read=group)
     return target
 
 
@@ -224,7 +224,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat
     else:
         raise NotImplementedError
 
-    agg = read_group(agg, None, obj, group=ROOT_GROUP)
+    agg = read_group(agg, ref=None, obj=obj)
     out = target.merge(agg, combine_attrs='no_conflicts')
     out.set_close(partial(_multi_file_closer, closers))
     return out
@@ -258,13 +258,34 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset:
         return xr.open_dataset(location, decode_times=False)
 
 
-def flatten_groups(target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf):
-    """TODO doc"""
-    raise NotImplementedError('TODO')
+def _get_leaf_groups(group: Netcdf | Group) -> list[str]:
+    group_children = [child for child in group.choice if isinstance(child, Group)]
+    if len(group_children) == 0:
+        return [group.name]
+    leafs = []
+    for child in group_children:
+        leafs += _get_leaf_groups(child)
+    return leafs
+
+
+def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset:
+    leaf_groups = _get_leaf_groups(root_group)
+    dims = {}
+    enums = {}
+    for leaf_group in leaf_groups:
+        # Mutate target to add leaf_group to it
+        read_group(target, ref, root_group, group_to_read=leaf_group, dims=dims, enums=enums)
+    return target
 
 
 def read_group(
-    target: xr.Dataset, ref: xr.Dataset, obj: Group | Netcdf, group: str, dims: dict = None
+    target: xr.Dataset,
+    ref: xr.Dataset | None,
+    obj: Group | Netcdf,
+    group_to_read: str = '/',
+    parent_group_path: str = '',
+    dims: dict = None,
+    enums: dict = None,
 ) -> xr.Dataset:
     """
     Parse <group> items, typically <dimension>, <variable>, <attribute> and <remove> elements.
@@ -273,14 +294,14 @@ def read_group(
     ----------
     target : xr.Dataset
       Target dataset to be updated.
-    ref : xr.Dataset
+    ref : xr.Dataset | None
       Reference dataset used to copy content into `target`.
     obj : Group | Netcdf
-       <netcdf> object description.
-    group : str
+      <netcdf> object description.
+    parent_group : str
       Path of the group to parse within the ncml.
-      The special value ``*`` opens every group and flatten the variables into a single
-      dataset.
+    dims: dict[str, Dimension]
+      Dictionary of the dimensions of this dataset.
 
     Returns
     -------
@@ -288,19 +309,16 @@ def read_group(
       Dataset holding variables and attributes defined in <netcdf> element.
     """
     dims = {} if dims is None else dims
-    enums = {}
-    if not group.startswith('/'):
-        group = f'/{group}'
-    prefix, _, child_path = group.partition('/')
-    if group == ROOT_GROUP or prefix == '':
-        current_group = ROOT_GROUP
-    else:
-        current_group = prefix
+    enums = {} if enums is None else enums
     for item in obj.choice:
         if isinstance(item, Dimension):
-            dims[item.name] = read_dimension(item)
+            dim_name = item.name
+            if dims.get(dim_name):
+                dims[dim_name].append(read_dimension(item))
+            else:
+                dims[dim_name] = [read_dimension(item)]
         elif isinstance(item, Variable):
-            target = read_variable(target, ref, item, dims, enums, group_path=group)
+            target = read_variable(target, ref, item, dims, enums, group_path=parent_group_path)
         elif isinstance(item, Attribute):
             read_attribute(target, item, ref)
         elif isinstance(item, Remove):
@@ -308,8 +326,15 @@ def read_group(
         elif isinstance(item, EnumTypedef):
             enums[item.name] = read_enum(item)
         elif isinstance(item, Group):
-            if item.name in child_path:
-                target = read_group(target, ref, item, current_group, dims)
+            if item.name in group_to_read:
+                target = read_group(
+                    target,
+                    ref,
+                    item,
+                    parent_group_path=f'{parent_group_path}/{item.name}',
+                    dims=dims,
+                    group_to_read=group_to_read,
+                )
             else:
                 # ignore group
                 continue
@@ -317,10 +342,19 @@ def read_group(
             pass  # <aggregation> elements are parsed in `read_netcdf`
         else:
             raise AttributeError
-
     return target
 
 
+def _build_snake_name(prefix: str, suffix: str) -> str:
+    if prefix == ROOT_GROUP:
+        pre = ''
+    else:
+        pre = prefix.removeprefix('/')
+        pre = pre.replace('/', '_')
+        pre += '_'
+    return f'{pre}{suffix}'
+
+
 def read_scan(obj: Aggregation.Scan, ncml: Path) -> list[xr.Dataset]:
     """
     Return list of datasets defined in <scan> element.
@@ -448,7 +482,7 @@ def read_variable(
     dimensions: dict,
     enums: dict,
     group_path: str,
-):
+) -> xr.Dataset:
     """
     Parse <variable> element.
 
@@ -465,7 +499,7 @@ def read_variable(
     enums: dict[str, dict]
       The enums types that have been read in the upper groups.
     group_path: str
-      Full path to the parent group
+      Path to the parent group
 
     Returns
     -------
@@ -485,20 +519,32 @@ def read_variable(
         ref_var = None
 
     # Read existing data or create empty DataArray
-    if (obj.name in target) or (obj.name in target.dims):
+    if obj.name in target or obj.name in target.dims:
         out = xr.as_variable(target[obj.name])
         if obj.type:
             out = out.astype(nctype(obj.type))
         ref_var = None
-    elif (obj.name in ref) or (obj.name in ref.dims):
+    elif obj.name in ref or obj.name in ref.dims:
         out = xr.as_variable(ref[obj.name])
         if obj.type:
             out = out.astype(nctype(obj.type))
         ref_var = ref[obj.name]
     elif obj.shape:
-        dims = obj.shape.split(' ')
-        shape = [dimensions[dim].length for dim in dims]
-        out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=dims)
+        var_dims = []
+        shape = []
+        for dim in obj.shape.split(' '):
+            if dimensions.get(dim) is None:
+                # TODO: Add test for this branch, might be useless.
+                err = (
+                    f"Unknown dimension '{dim}'."
+                    ' Make sure it is declared before being used in the NCML.'
+                )
+                raise ValueError(err)
+            shape.append(dimensions[dim][-1].length)
+            if (dim_count := len(dimensions[dim])) > 1:
+                dim = f'{dim}__{dim_count - 1}'
+            var_dims.append(dim)
+        out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=var_dims)
     elif obj.shape == '':
         out = build_scalar_variable(var_name=obj.name, values_tag=obj.values, var_type=obj.type)
     else:
@@ -508,10 +554,8 @@ def read_variable(
     # Set variable attributes
     for item in obj.attribute:
         read_attribute(out, item, ref=ref_var)
-    out.attrs['group_path'] = group_path
-    # TODO (@bzah): Maybe rename the variable using the parent full path
-    #       if we are flattening, this would avoid overriding an existing
-    #       variable of target.
+    if group_path:
+        out.attrs['group_path'] = group_path
 
     # Remove attributes or dimensions
     for item in obj.remove:
@@ -537,14 +581,26 @@ def read_variable(
         raise NotImplementedError
 
     if obj.typedef in enums.keys():
-        # TODO: Also update encoding when https://github.com/pydata/xarray/pull/8147
-        #       is merged in xarray.
+        # TODO (@bzah): Update this once Enums are merged in xarray
+        #      https://github.com/pydata/xarray/pull/8147
         out.attrs['flag_values'] = enums[obj.typedef]['flag_values']
         out.attrs['flag_meanings'] = enums[obj.typedef]['flag_meanings']
     elif obj.typedef is not None:
         raise NotImplementedError
 
-    target[obj.name] = out
+    var_name = obj.name
+    # TODO (abel): Fix for aggregation: we must override the variable and not create a new one in aggregation case
+    #               How to differentiate this case with variables in different groups that bear the same name ?
+    #               Should we carry a flag `override` from read_aggregation ?
+    #               Or when a flag to know we are in sub-groups ?
+    if (
+        (existing_var := target.get(var_name)) is not None
+        and existing_var.attrs.get('group_path')
+        and existing_var.attrs.get('group_path') != group_path
+    ):
+        similar_var_count = len([v for v in target.variables.keys() if f'{var_name}__' in v])
+        var_name = f'{var_name}__{similar_var_count + 1}'
+    target[var_name] = out
     return target
 
 

From 87aa891df4dc6021f52409695b42740938643608 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Tue, 16 Jan 2024 12:09:29 +0100
Subject: [PATCH 04/12] Add test for error

---
 tests/data/testGroupInvalidDim.xml | 7 +++++++
 tests/test_parser.py               | 5 +++++
 xncml/parser.py                    | 5 -----
 3 files changed, 12 insertions(+), 5 deletions(-)
 create mode 100644 tests/data/testGroupInvalidDim.xml

diff --git a/tests/data/testGroupInvalidDim.xml b/tests/data/testGroupInvalidDim.xml
new file mode 100644
index 0000000..5ddce24
--- /dev/null
+++ b/tests/data/testGroupInvalidDim.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
+    <variable name="toto" shape="myDim" type="ushort">
+        <values>3</values>
+    </variable>
+    <dimension name="myDim"></dimension>
+</netcdf>
diff --git a/tests/test_parser.py b/tests/test_parser.py
index a0119bf..3cb0851 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -367,6 +367,11 @@ def test_read_group__conflicting_dims():
     assert 'index' in ds.gr_b_var.dims
 
 
+def test_read__invalid_dim():
+    with pytest.raises(ValueError, match="Unknown dimension 'myDim'.*"):
+        xncml.open_ncml(data / 'testGroupInvalidDim.xml')
+
+
 def test_flatten_groups():
     """Read every group and flatten everything in a single dataset/group."""
     ds = xncml.open_ncml(data / 'testGroup.xml', group='*')
diff --git a/xncml/parser.py b/xncml/parser.py
index dc3d144..2c5dd1f 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -534,7 +534,6 @@ def read_variable(
         shape = []
         for dim in obj.shape.split(' '):
             if dimensions.get(dim) is None:
-                # TODO: Add test for this branch, might be useless.
                 err = (
                     f"Unknown dimension '{dim}'."
                     ' Make sure it is declared before being used in the NCML.'
@@ -589,10 +588,6 @@ def read_variable(
         raise NotImplementedError
 
     var_name = obj.name
-    # TODO (abel): Fix for aggregation: we must override the variable and not create a new one in aggregation case
-    #               How to differentiate this case with variables in different groups that bear the same name ?
-    #               Should we carry a flag `override` from read_aggregation ?
-    #               Or when a flag to know we are in sub-groups ?
     if (
         (existing_var := target.get(var_name)) is not None
         and existing_var.attrs.get('group_path')

From 3243a011a74c86f2f964685d9ecbbddf8fff68ea Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Tue, 16 Jan 2024 13:55:03 +0100
Subject: [PATCH 05/12] DOC: document changes

---
 CHANGELOG.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1727dad..d69d0a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,15 @@
+0.5.0 (unreleased)
+==================
+
+**Breaking changes**
+- Nested group handling:
+  Before this version, all groups were read, but conflicting variable names in-between groups would be cause a lost of data.
+  Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional `group` argument to specify which group should be read.
+  When ``group`` is missing, the root group is assumed.
+  Additionally ``group`` can be set to ``'*'``. In that case every group is read and the hierachy is flatten.
+  In the event of conflicting names, the variable/dimension name will be updated by appending '__n' where n is incremented.
+
+
 0.4.0 (2024-01-08)
 ==================
 

From 5daba4a05a0fe6ecebcb37c53876e337f7af1a17 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Tue, 16 Jan 2024 17:15:30 +0100
Subject: [PATCH 06/12] FIX: scalar parsing and nested groups

---
 tests/data/testGroupMultiLayers.xml |  8 +++++++-
 tests/test_parser.py                |  6 ++++--
 xncml/parser.py                     | 26 ++++++++++++++++++--------
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/tests/data/testGroupMultiLayers.xml b/tests/data/testGroupMultiLayers.xml
index 90b35eb..789fac4 100644
--- a/tests/data/testGroupMultiLayers.xml
+++ b/tests/data/testGroupMultiLayers.xml
@@ -2,8 +2,14 @@
 <netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
     <group name="gr_a">
         <dimension name="index" length="42"/>
-        <group name="sub_gr_a">
+        <group name="sub_gr">
             <variable name="gr_a_var" shape="index" type="ushort"></variable>
         </group>
     </group>
+    <group name="gr_b">
+        <dimension name="index" length="22"/>
+        <group name="sub_gr">
+            <variable name="gr_b_var" shape="index" type="ushort"></variable>
+        </group>
+    </group>
 </netcdf>
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 3cb0851..952343c 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -314,8 +314,8 @@ def test_empty_scalar__no_values_tag():
     filled, because numpy can't create an empty typed scalar.
     """
     ds = xncml.open_ncml(data / 'testEmptyScalar.xml')
-    assert ds['empty_scalar_var'].dtype == np.dtype('O')
-    assert ds['empty_scalar_var'].item() is None
+    assert ds['empty_scalar_var'].dtype == np.dtype('float64')
+    assert ds['empty_scalar_var'].item() == 0
 
 
 def test_empty_scalar__with_empty_values_tag():
@@ -396,6 +396,8 @@ def test_flatten_groups__sub_groups():
     """Read every group and rename dimensions"""
     ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*')
     assert ds.dims['index'] is not None
+    assert ds['gr_a_var'] is not None
+    assert ds['gr_b_var'] is not None
 
 
 # --- #
diff --git a/xncml/parser.py b/xncml/parser.py
index 2c5dd1f..53d4eeb 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -258,18 +258,27 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset:
         return xr.open_dataset(location, decode_times=False)
 
 
-def _get_leaf_groups(group: Netcdf | Group) -> list[str]:
+def _get_netcdf_leaves(netcdf: Netcdf) -> list[str]:
+    parent = ''
+    root_children = [child for child in netcdf.choice if isinstance(child, Group)]
+    if len(root_children) == 0:
+        return ['/']
+    leaves = []
+    for child in root_children:
+        leaves += list(_get_group_leaves(child, parent=parent))
+    return leaves
+
+
+def _get_group_leaves(group: Netcdf | Group, parent: str) -> str:
     group_children = [child for child in group.choice if isinstance(child, Group)]
     if len(group_children) == 0:
-        return [group.name]
-    leafs = []
+        yield f'{parent}/{group.name}'
     for child in group_children:
-        leafs += _get_leaf_groups(child)
-    return leafs
+        yield from _get_group_leaves(child, parent=f'{parent}/{group.name}')
 
 
 def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset:
-    leaf_groups = _get_leaf_groups(root_group)
+    leaf_groups = _get_netcdf_leaves(root_group)
     dims = {}
     enums = {}
     for leaf_group in leaf_groups:
@@ -649,7 +658,7 @@ def read_values(var_name: str, expected_size: int, values_tag: Values) -> list:
 
 
 def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> xr.Variable:
-    """Read values for <variable> element.
+    """Build an xr.Variable for scalar variables.
 
     Parameters
     ----------
@@ -676,7 +685,8 @@ def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> x
             ' <values> is empty. Provide a single values within <values></values>'
             ' to preserve the type.'
         )
-        return xr.Variable(data=None, dims=())
+        default_value = nctype(var_type)()
+        return xr.Variable(data=default_value, dims=())
     values_content = read_values(var_name, expected_size=1, values_tag=values_tag)
     if len(values_content) == 1:
         return xr.Variable(data=np.array(values_content[0], dtype=nctype(var_type))[()], dims=())

From 1959e9270a8489bcd084c2d0a05f6806dcf15a87 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Tue, 16 Jan 2024 17:18:13 +0100
Subject: [PATCH 07/12] remove dead code

---
 xncml/parser.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/xncml/parser.py b/xncml/parser.py
index 53d4eeb..ee686ae 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -354,16 +354,6 @@ def read_group(
     return target
 
 
-def _build_snake_name(prefix: str, suffix: str) -> str:
-    if prefix == ROOT_GROUP:
-        pre = ''
-    else:
-        pre = prefix.removeprefix('/')
-        pre = pre.replace('/', '_')
-        pre += '_'
-    return f'{pre}{suffix}'
-
-
 def read_scan(obj: Aggregation.Scan, ncml: Path) -> list[xr.Dataset]:
     """
     Return list of datasets defined in <scan> element.

From c32d43787631b3e00cb655eaec73a32601d4e8b9 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Fri, 2 Feb 2024 10:23:09 +0100
Subject: [PATCH 08/12] Improve generator

---
 xncml/parser.py | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/xncml/parser.py b/xncml/parser.py
index ee686ae..2c33b4e 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -36,6 +36,7 @@
 import datetime as dt
 from functools import partial
 from pathlib import Path
+from typing import TYPE_CHECKING
 from warnings import warn
 
 import numpy as np
@@ -57,6 +58,9 @@
     Variable,
 )
 
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
 __author__ = 'David Huard'
 __date__ = 'July 2022'
 __contact__ = 'huard.david@ouranos.ca'
@@ -258,30 +262,19 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset:
         return xr.open_dataset(location, decode_times=False)
 
 
-def _get_netcdf_leaves(netcdf: Netcdf) -> list[str]:
-    parent = ''
-    root_children = [child for child in netcdf.choice if isinstance(child, Group)]
-    if len(root_children) == 0:
-        return ['/']
-    leaves = []
-    for child in root_children:
-        leaves += list(_get_group_leaves(child, parent=parent))
-    return leaves
-
-
-def _get_group_leaves(group: Netcdf | Group, parent: str) -> str:
+def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[str]:
     group_children = [child for child in group.choice if isinstance(child, Group)]
+    current_path = '/' if parent is None else f'{parent}/{group.name}'
     if len(group_children) == 0:
-        yield f'{parent}/{group.name}'
+        yield current_path
     for child in group_children:
-        yield from _get_group_leaves(child, parent=f'{parent}/{group.name}')
+        yield from _get_leaves(child, parent=current_path)
 
 
 def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset:
-    leaf_groups = _get_netcdf_leaves(root_group)
     dims = {}
     enums = {}
-    for leaf_group in leaf_groups:
+    for leaf_group in _get_leaves(root_group):
         # Mutate target to add leaf_group to it
         read_group(target, ref, root_group, group_to_read=leaf_group, dims=dims, enums=enums)
     return target

From ff807efa4b35b1010da88d6223143b54b7ff80c3 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Fri, 2 Feb 2024 17:13:49 +0100
Subject: [PATCH 09/12] ENH: Improve group parsing

- improved performances by parsing ncml reprsentation only once
- fixed issues of rewritting content when read multiple times
---
 tests/data/testGroupMultiLayers.xml |  7 ++-
 tests/test_parser.py                |  9 ++--
 xncml/parser.py                     | 71 +++++++++++++++--------------
 3 files changed, 49 insertions(+), 38 deletions(-)

diff --git a/tests/data/testGroupMultiLayers.xml b/tests/data/testGroupMultiLayers.xml
index 789fac4..4d41694 100644
--- a/tests/data/testGroupMultiLayers.xml
+++ b/tests/data/testGroupMultiLayers.xml
@@ -1,15 +1,18 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <netcdf xmlns="http://www.unidata.ucar.edu/namespaces/netcdf/ncml-2.2">
+    <variable name="a_var" shape="" type="ushort">
+        <values>2</values>
+    </variable>
     <group name="gr_a">
         <dimension name="index" length="42"/>
         <group name="sub_gr">
-            <variable name="gr_a_var" shape="index" type="ushort"></variable>
+            <variable name="a_var" shape="index" type="ushort"></variable>
         </group>
     </group>
     <group name="gr_b">
         <dimension name="index" length="22"/>
         <group name="sub_gr">
-            <variable name="gr_b_var" shape="index" type="ushort"></variable>
+            <variable name="a_var" shape="index" type="ushort"></variable>
         </group>
     </group>
 </netcdf>
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 952343c..33a70bd 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -395,9 +395,12 @@ def test_flatten_groups__conflicting_dims():
 def test_flatten_groups__sub_groups():
     """Read every group and rename dimensions"""
     ds = xncml.open_ncml(data / 'testGroupMultiLayers.xml', group='*')
-    assert ds.dims['index'] is not None
-    assert ds['gr_a_var'] is not None
-    assert ds['gr_b_var'] is not None
+    assert ds.dims['index'] == 42
+    assert ds.dims['index__1'] == 22
+    assert ds['a_var'].size == 1
+    assert ds['a_var'] == 2
+    assert ds['a_var__1'].size == 42
+    assert ds['a_var__2'].size == 22
 
 
 # --- #
diff --git a/xncml/parser.py b/xncml/parser.py
index 2c33b4e..1ce4b3c 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -155,7 +155,7 @@ def read_netcdf(
     else:
         if not group.startswith('/'):
             group = f'/{group}'
-        target = read_group(target, ref, obj, group_to_read=group)
+        target = read_group(target, ref, obj, groups_to_read=[group])
     return target
 
 
@@ -228,7 +228,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat
     else:
         raise NotImplementedError
 
-    agg = read_group(agg, ref=None, obj=obj)
+    agg = read_group(agg, ref=None, obj=obj, groups_to_read=['/'])
     out = target.merge(agg, combine_attrs='no_conflicts')
     out.set_close(partial(_multi_file_closer, closers))
     return out
@@ -264,7 +264,7 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset:
 
 def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[str]:
     group_children = [child for child in group.choice if isinstance(child, Group)]
-    current_path = '/' if parent is None else f'{parent}/{group.name}'
+    current_path = '' if parent is None else f'{parent}/{group.name}'
     if len(group_children) == 0:
         yield current_path
     for child in group_children:
@@ -274,9 +274,8 @@ def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[st
 def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset:
     dims = {}
     enums = {}
-    for leaf_group in _get_leaves(root_group):
-        # Mutate target to add leaf_group to it
-        read_group(target, ref, root_group, group_to_read=leaf_group, dims=dims, enums=enums)
+    leaves_group = list(_get_leaves(root_group))
+    read_group(target, ref, root_group, groups_to_read=leaves_group, dims=dims, enums=enums)
     return target
 
 
@@ -284,8 +283,8 @@ def read_group(
     target: xr.Dataset,
     ref: xr.Dataset | None,
     obj: Group | Netcdf,
-    group_to_read: str = '/',
-    parent_group_path: str = '',
+    groups_to_read: list[str],
+    parent_group_path: str = '/',
     dims: dict = None,
     enums: dict = None,
 ) -> xr.Dataset:
@@ -328,14 +327,14 @@ def read_group(
         elif isinstance(item, EnumTypedef):
             enums[item.name] = read_enum(item)
         elif isinstance(item, Group):
-            if item.name in group_to_read:
+            if any(item.name in group_name for group_name in groups_to_read):
                 target = read_group(
                     target,
                     ref,
                     item,
-                    parent_group_path=f'{parent_group_path}/{item.name}',
+                    parent_group_path=f'{parent_group_path}{item.name}/',
                     dims=dims,
-                    group_to_read=group_to_read,
+                    groups_to_read=groups_to_read,
                 )
             else:
                 # ignore group
@@ -510,17 +509,22 @@ def read_variable(
     else:
         ref_var = None
 
+    var_name = obj.name
     # Read existing data or create empty DataArray
-    if obj.name in target or obj.name in target.dims:
-        out = xr.as_variable(target[obj.name])
+    if (existing_var := target.get(var_name)) is not None and existing_var.attrs.get(
+        'group_path'
+    ) in [None, group_path]:
+        out = xr.as_variable(target[var_name])
         if obj.type:
             out = out.astype(nctype(obj.type))
         ref_var = None
-    elif obj.name in ref or obj.name in ref.dims:
-        out = xr.as_variable(ref[obj.name])
+    elif (existing_var := ref.get(var_name)) is not None and existing_var.attrs.get(
+        'group_path'
+    ) in [None, group_path]:
+        out = xr.as_variable(ref[var_name])
         if obj.type:
             out = out.astype(nctype(obj.type))
-        ref_var = ref[obj.name]
+        ref_var = ref[var_name]
     elif obj.shape:
         var_dims = []
         shape = []
@@ -537,16 +541,15 @@ def read_variable(
             var_dims.append(dim)
         out = xr.Variable(data=np.empty(shape, dtype=nctype(obj.type)), dims=var_dims)
     elif obj.shape == '':
-        out = build_scalar_variable(var_name=obj.name, values_tag=obj.values, var_type=obj.type)
+        out = build_scalar_variable(var_name=var_name, values_tag=obj.values, var_type=obj.type)
     else:
-        error_msg = f'Could not build variable `{obj.name}`.'
+        error_msg = f'Could not build variable `{var_name }`.'
         raise ValueError(error_msg)
 
     # Set variable attributes
     for item in obj.attribute:
         read_attribute(out, item, ref=ref_var)
-    if group_path:
-        out.attrs['group_path'] = group_path
+    out.attrs['group_path'] = group_path
 
     # Remove attributes or dimensions
     for item in obj.remove:
@@ -554,7 +557,7 @@ def read_variable(
 
     # Read values for arrays (already done for a scalar)
     if obj.values and obj.shape != '':
-        data = read_values(obj.name, out.size, obj.values)
+        data = read_values(var_name, out.size, obj.values)
         data = out.dtype.type(data)
         out = xr.Variable(
             out.dims,
@@ -578,15 +581,16 @@ def read_variable(
         out.attrs['flag_meanings'] = enums[obj.typedef]['flag_meanings']
     elif obj.typedef is not None:
         raise NotImplementedError
+    import re
 
-    var_name = obj.name
-    if (
-        (existing_var := target.get(var_name)) is not None
-        and existing_var.attrs.get('group_path')
-        and existing_var.attrs.get('group_path') != group_path
-    ):
-        similar_var_count = len([v for v in target.variables.keys() if f'{var_name}__' in v])
-        var_name = f'{var_name}__{similar_var_count + 1}'
+    reg = re.compile(f'^{var_name}__|{var_name}')
+    similar_vars_but_diff_path = [
+        v
+        for v in target.data_vars
+        if reg.match(v) and target[v].attrs.get('group_path') not in [None, group_path]
+    ]
+    if len(similar_vars_but_diff_path) > 0:
+        var_name = f'{var_name}__{len(similar_vars_but_diff_path)}'
     target[var_name] = out
     return target
 
@@ -611,7 +615,8 @@ def read_values(var_name: str, expected_size: int, values_tag: Values) -> list:
     if values_tag.from_attribute is not None:
         error_msg = (
             'xncml cannot yet fetch values from a global or a '
-            f' variable attribute using <from_attribute>, here on variable {var_name}.'
+            ' variable attribute using <from_attribute>, here on variable'
+            f' {var_name}.'
         )
         raise NotImplementedError(error_msg)
     if values_tag.start is not None and values_tag.increment is not None:
@@ -663,12 +668,12 @@ def build_scalar_variable(var_name: str, values_tag: Values, var_type: str) -> x
       If the <values> tag is not a valid scalar.
     """
     if values_tag is None:
+        default_value = nctype(var_type)()
         warn(
-            f'Could not set the type for the scalar variable {var_name}, as its'
-            ' <values> is empty. Provide a single values within <values></values>'
+            f'The scalar variable {var_name} has no values set within'
+            f' <values></values>. A default value of {default_value} is set'
             ' to preserve the type.'
         )
-        default_value = nctype(var_type)()
         return xr.Variable(data=default_value, dims=())
     values_content = read_values(var_name, expected_size=1, values_tag=values_tag)
     if len(values_content) == 1:

From 18a7c8b2b54160d327fee870f35f2450c160c8a1 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Mon, 5 Feb 2024 13:45:11 +0100
Subject: [PATCH 10/12] Clean

---
 tests/test_parser.py | 8 ++++----
 xncml/parser.py      | 7 ++++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/test_parser.py b/tests/test_parser.py
index 33a70bd..9676daa 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -310,8 +310,8 @@ def test_unsigned_type():
 
 def test_empty_scalar__no_values_tag():
     """
-    A scalar without a <values> tag will not be typed, even if the 'type' attribute is
-    filled, because numpy can't create an empty typed scalar.
+    A scalar variable which <values> is missing will have its value set to
+    the default value of its type.
     """
     ds = xncml.open_ncml(data / 'testEmptyScalar.xml')
     assert ds['empty_scalar_var'].dtype == np.dtype('float64')
@@ -319,13 +319,13 @@ def test_empty_scalar__no_values_tag():
 
 
 def test_empty_scalar__with_empty_values_tag():
-    """A scalar with an empty in its <values> tag is invalid."""
+    """A scalar with an empty <values> tag is invalid."""
     with pytest.raises(ValueError, match='No values found for variable .*'):
         xncml.open_ncml(data / 'testEmptyScalar_withValuesTag.xml')
 
 
 def test_multiple_values_for_scalar():
-    """a scalar with multiple values in its <values> tag is invalid."""
+    """A scalar with multiple values in its <values> tag is invalid."""
     with pytest.raises(ValueError, match='The expected size for variable .* was 1, .*'):
         xncml.open_ncml(data / 'testEmptyScalar_withMultipleValues.xml')
 
diff --git a/xncml/parser.py b/xncml/parser.py
index b7649ee..8b20cd6 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -86,7 +86,7 @@ def parse(path: Path) -> Netcdf:
     return parser.from_path(path, Netcdf)
 
 
-def open_ncml(ncml: str | Path, group: str = '/') -> xr.Dataset:
+def open_ncml(ncml: str | Path, group: str = ROOT_GROUP) -> xr.Dataset:
     """
     Convert NcML document to a dataset.
 
@@ -227,7 +227,7 @@ def read_aggregation(target: xr.Dataset, obj: Aggregation, ncml: Path) -> xr.Dat
     else:
         raise NotImplementedError
 
-    agg = read_group(agg, ref=None, obj=obj, groups_to_read=['/'])
+    agg = read_group(agg, ref=None, obj=obj, groups_to_read=[ROOT_GROUP])
     out = target.merge(agg, combine_attrs='no_conflicts')
     out.set_close(partial(_multi_file_closer, closers))
     return out
@@ -283,7 +283,7 @@ def read_group(
     ref: xr.Dataset | None,
     obj: Group | Netcdf,
     groups_to_read: list[str],
-    parent_group_path: str = '/',
+    parent_group_path: str = ROOT_GROUP,
     dims: dict = None,
     enums: dict = None,
 ) -> xr.Dataset:
@@ -311,6 +311,7 @@ def read_group(
     dims = {} if dims is None else dims
     enums = {} if enums is None else enums
     for item in obj.choice:
+
         if isinstance(item, Dimension):
             dim_name = item.name
             if dims.get(dim_name):

From ae842fe9378da8985f613fca7abd21c500d0ee52 Mon Sep 17 00:00:00 2001
From: Abel Aoun <aoun.abel@gmail.com>
Date: Mon, 5 Feb 2024 18:10:57 +0100
Subject: [PATCH 11/12] Apply suggestions from code review

Co-authored-by: David Huard <huard.david@ouranos.ca>
---
 CHANGELOG.md    |  6 +-----
 xncml/parser.py | 10 +++++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d69d0a6..68f0d9c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,11 +3,7 @@
 
 **Breaking changes**
 - Nested group handling:
-  Before this version, all groups were read, but conflicting variable names in-between groups would be cause a lost of data.
-  Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional `group` argument to specify which group should be read.
-  When ``group`` is missing, the root group is assumed.
-  Additionally ``group`` can be set to ``'*'``. In that case every group is read and the hierachy is flatten.
-  In the event of conflicting names, the variable/dimension name will be updated by appending '__n' where n is incremented.
+  Before this version, all groups were read, but conflicting variable names in-between groups would shadow data.  Now, similarly to xarray ``open_dataset``, ``open_ncml`` accepts an optional ``group`` argument to specify which group should be read. When ``group`` is not specified, it defaults to the root group. Additionally ``group`` can be set to ``'*'`` so that every group is read and the hierarchy is flattened.   In the event of conflicting variable/dimension names across groups, the conflicting name will be modified by appending ``'__n'`` where n is incremented.
 
 
 0.4.0 (2024-01-08)
diff --git a/xncml/parser.py b/xncml/parser.py
index 8b20cd6..320b3de 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -60,7 +60,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterator
 
-__author__ = 'David Huard'
+__author__ = 'David Huard, Abel Aoun'
 __date__ = 'July 2022'
 __contact__ = 'huard.david@ouranos.ca'
 
@@ -96,7 +96,7 @@ def open_ncml(ncml: str | Path, group: str = ROOT_GROUP) -> xr.Dataset:
       Path to NcML file.
     group : str
       Path of the group to parse within the ncml.
-      The special value ``*`` opens every group and flatten the variables into a single
+      The special value ``*`` opens every group and flattens the variables into a single
       dataset, renaming variables and dimensions if conflicting names are found.
 
     Returns
@@ -129,7 +129,7 @@ def read_netcdf(
       Path to NcML document, sometimes required to follow relative links.
     group : str
       Path of the group to parse within the ncml.
-      The special value ``*`` opens every group and flatten the variables into a single
+      The special value ``*`` opens every group and flattens the variables into a single
       dataset.
 
     Returns
@@ -488,9 +488,9 @@ def read_variable(
     dimensions : dict
       Dimension attributes keyed by name.
     enums: dict[str, dict]
-      The enums types that have been read in the upper groups.
+      The enums types that have been read in the parent groups.
     group_path: str
-      Path to the parent group
+      Path to the parent group.
 
     Returns
     -------

From dac587080c70712b066b34c5482fbe4f8ac5ad25 Mon Sep 17 00:00:00 2001
From: Abel Aoun <abel.aoun@external.eumetsat.int>
Date: Mon, 5 Feb 2024 18:18:55 +0100
Subject: [PATCH 12/12] ENH: Improve parser following code review

---
 xncml/parser.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/xncml/parser.py b/xncml/parser.py
index 320b3de..a71f249 100644
--- a/xncml/parser.py
+++ b/xncml/parser.py
@@ -150,7 +150,7 @@ def read_netcdf(
     for item in filter_by_class(obj.choice, Aggregation):
         target = read_aggregation(target, item, ncml)
     if group == FLATTEN_GROUPS:
-        target = flatten_groups(target, ref, obj)
+        target = _flatten_groups(target, ref, obj)
     else:
         if not group.startswith('/'):
             group = f'/{group}'
@@ -263,14 +263,14 @@ def read_ds(obj: Netcdf, ncml: Path) -> xr.Dataset:
 
 def _get_leaves(group: Netcdf | Group, parent: str | None = None) -> Iterator[str]:
     group_children = [child for child in group.choice if isinstance(child, Group)]
-    current_path = '' if parent is None else f'{parent}/{group.name}'
+    current_path = ROOT_GROUP if parent is None else f'{parent}{group.name}/'
     if len(group_children) == 0:
         yield current_path
     for child in group_children:
         yield from _get_leaves(child, parent=current_path)
 
 
-def flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset:
+def _flatten_groups(target: xr.Dataset, ref: xr.Dataset, root_group: Netcdf) -> xr.Dataset:
     dims = {}
     enums = {}
     leaves_group = list(_get_leaves(root_group))
@@ -298,8 +298,10 @@ def read_group(
       Reference dataset used to copy content into `target`.
     obj : Group | Netcdf
       <netcdf> object description.
-    parent_group : str
-      Path of the group to parse within the ncml.
+    groups_to_read : list[str]
+      List of groups that must be read and included in `target`.
+    parent_group_path : str
+      Path of parent group, by default the root group '/'.
     dims: dict[str, Dimension]
       Dictionary of the dimensions of this dataset.
 
@@ -311,7 +313,6 @@ def read_group(
     dims = {} if dims is None else dims
     enums = {} if enums is None else enums
     for item in obj.choice:
-
         if isinstance(item, Dimension):
             dim_name = item.name
             if dims.get(dim_name):