Merge pull request #164 from ericpre/markers_update

Update of HyperSpy Markers API changes for the `hspy`/`zspy` format
hyperspy · Oct 5, 2023 · 42574d2 · 42574d2
2 parents e4e71ad + 0255321
commit 42574d2
Show file tree

Hide file tree

Showing 9 changed files with 202 additions and 129 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -20,7 +20,8 @@ jobs:
           - os: ubuntu
             PYTHON_VERSION: '3.8'
             # Set pillow and scikit-image version to be compatible with imageio and scipy
-            DEPENDENCIES: matplotlib==3.1.3 numpy==1.20.0 scipy==1.5 imagecodecs==2020.1.31 tifffile==2020.2.16 dask[array]==2021.3.1 numba==0.52 imageio==2.16 pillow==8.3.2 scikit-image==0.18.0
+            # matplotlib needs 3.5 to support markers in hyperspy 2.0 (requires `collection.set_offset_transform`)
+            DEPENDENCIES: matplotlib==3.5 numpy==1.20.0 scipy==1.5 imagecodecs==2020.1.31 tifffile==2020.2.16 dask[array]==2021.3.1 numba==0.52 imageio==2.16 pillow==8.3.2 scikit-image==0.18.0
             LABEL: '-oldest'
           # test minimum requirement
           - os: ubuntu

diff --git a/docs/supported_formats/hspy.rst b/docs/supported_formats/hspy.rst
@@ -141,6 +141,12 @@ the experiments and that will be accessible as attributes of the
 Changelog
 ^^^^^^^^^
 
+v3.3
+""""
+- Rename the ``ragged_shapes`` dataset to ``_ragged_shapes_{key}``, where ``key``
+  is the name of the corresponding ragged dataset.
+
+
 v3.2
 """"
 - Deprecated ``record_by`` attribute is removed

diff --git a/pyproject.toml b/pyproject.toml
@@ -49,7 +49,7 @@ file = "COPYING.txt"
 [project.optional-dependencies]
 blockfile = ["scikit-image>=0.18"]
 mrcz = ["blosc>=1.5", "mrcz>=0.3.6"]
-scalebar_export = ["matplotlib-scalebar", "matplotlib>=3.1.3"]
+scalebar_export = ["matplotlib-scalebar", "matplotlib>=3.5"]
 tiff = ["tifffile>=2020.2.16", "imagecodecs>=2020.1.31"]
 # Add sidpy dependency and pinning as workaround to fix pyUSID import
 # Remove sidpy dependency once https://github.com/pycroscopy/pyUSID/issues/85 is fixed.

diff --git a/rsciio/_hierarchical.py b/rsciio/_hierarchical.py
@@ -29,7 +29,7 @@
 from rsciio.utils.tools import ensure_unicode
 
 
-version = "3.2"
+version = "3.3"
 
 default_version = Version(version)
 
@@ -225,6 +225,26 @@ def read(self, lazy):
 
         return exp_dict_list
 
+    @staticmethod
+    def _read_array(group, dataset_key):
+        # This is a workaround for the lack of support for n-d ragged array
+        # in h5py and zarr. There is work in progress for implementation in zarr:
+        # https://github.com/zarr-developers/zarr-specs/issues/62 which may be
+        # relevant to implement here when available
+        data = group[dataset_key]
+        key = f"_ragged_shapes_{dataset_key}"
+        if "ragged_shapes" in group:
+            # For files saved with RosettaSciIO <= 0.1: the shapes dataset was
+            # renamed from `ragged_shapes` to `_ragged_shapes_{key}` in v3.3
+            key = "ragged_shapes"
+        if key in group:
+            ragged_shape = group[key]
+            new_data = np.empty(shape=data.shape, dtype=object)
+            for i in np.ndindex(data.shape):
+                new_data[i] = np.reshape(data[i], ragged_shape[i])
+            data = new_data
+        return data
+
     def group2signaldict(self, group, lazy=False):
         """
         Reads a h5py/zarr group and returns a signal dictionary.
@@ -253,8 +273,12 @@ def group2signaldict(self, group, lazy=False):
         exp = {
             "metadata": self._group2dict(group[metadata], lazy=lazy),
             "original_metadata": self._group2dict(group[original_metadata], lazy=lazy),
-            "attributes": {},
         }
+        if "attributes" in group:
+            # RosettaSciIO version is > 0.1
+            exp["attributes"] = self._group2dict(group["attributes"], lazy=lazy)
+        else:
+            exp["attributes"] = {}
         if "package" in group.attrs:
             # HyperSpy version is >= 1.5
             exp["package"] = group.attrs["package"]
@@ -266,20 +290,13 @@ def group2signaldict(self, group, lazy=False):
             exp["package"] = ""
             exp["package_version"] = ""
 
-        data = group["data"]
-        try:
-            ragged_shape = group["ragged_shapes"]
-            new_data = np.empty(shape=data.shape, dtype=object)
-            for i in np.ndindex(data.shape):
-                new_data[i] = np.reshape(data[i], ragged_shape[i])
-            data = new_data
-        except KeyError:
-            pass
+        data = self._read_array(group, "data")
         if lazy:
             data = da.from_array(data, chunks=data.chunks)
             exp["attributes"]["_lazy"] = True
         else:
             data = np.asanyarray(data)
+            exp["attributes"]["_lazy"] = False
         exp["data"] = data
         axes = []
         for i in range(len(exp["data"].shape)):
@@ -514,21 +531,22 @@ def _group2dict(self, group, dictionary=None, lazy=False):
                 dictionary[key] = value
         if not isinstance(group, self.Dataset):
             for key in group.keys():
-                if key.startswith("_sig_"):
+                if key.startswith("_ragged_shapes_"):
+                    # array used to parse a ragged array; it needs to be skipped,
+                    # otherwise it would be wrongly read as kwargs when reading
+                    # variable length markers, as they use ragged arrays
+                    pass
+                elif key.startswith("_sig_"):
                     dictionary[key] = self.group2signaldict(group[key])
                 elif isinstance(group[key], self.Dataset):
-                    dat = group[key]
+                    dat = self._read_array(group, key)
                     kn = key
                     if key.startswith("_list_"):
-                        if h5py.check_string_dtype(dat.dtype) and hasattr(dat, "asstr"):
-                            # h5py 3.0 and newer
-                            # https://docs.h5py.org/en/3.0.0/strings.html
-                            dat = dat.asstr()[:]
-                        ans = np.array(dat)
+                        ans = self._parse_iterable(dat)
                         ans = ans.tolist()
                         kn = key[6:]
                     elif key.startswith("_tuple_"):
-                        ans = np.array(dat)
+                        ans = self._parse_iterable(dat)
                         ans = tuple(ans.tolist())
                         kn = key[7:]
                     elif dat.dtype.char == "S":
@@ -574,6 +592,14 @@ def _group2dict(self, group, dictionary=None, lazy=False):
 
         return dictionary
 
+    @staticmethod
+    def _parse_iterable(data):
+        if h5py.check_string_dtype(data.dtype) and hasattr(data, "asstr"):
+            # h5py 3.0 and newer
+            # https://docs.h5py.org/en/3.0.0/strings.html
+            data = data.asstr()[:]
+        return np.array(data)
+
 
 class HierarchicalWriter:
     """
@@ -687,10 +713,10 @@ def overwrite_dataset(cls, group, data, key, signal_axes=None, chunks=None, **kw
                 new_data[i] = data[i].ravel()
                 shapes[i] = np.array(data[i].shape)
             shape_dset = cls._get_object_dset(
-                group, shapes, "ragged_shapes", shapes.shape, **kwds
+                group, shapes, f"_ragged_shapes_{key}", shapes.shape, **kwds
             )
             cls._store_data(
-                shapes, shape_dset, group, "ragged_shapes", chunks=shapes.shape
+                shapes, shape_dset, group, f"_ragged_shapes_{key}", chunks=shapes.shape
             )
             cls._store_data(new_data, dset, group, key, chunks)
         else:
@@ -738,6 +764,8 @@ def write_signal(self, signal, group, write_dataset=True, chunks=None, **kwds):
         self.dict2group(signal["original_metadata"], original_par, **kwds)
         learning_results = group.require_group("learning_results")
         self.dict2group(signal["learning_results"], learning_results, **kwds)
+        attributes = group.require_group("attributes")
+        self.dict2group(signal["attributes"], attributes, **kwds)
 
         if signal["models"]:
             model_group = self.file.require_group("Analysis/models")

diff --git a/rsciio/hspy/_api.py b/rsciio/hspy/_api.py
@@ -87,7 +87,7 @@ def _get_object_dset(group, data, key, chunks, **kwds):
         if chunks is None:
             chunks = 1
         dset = group.require_dataset(
-            key, chunks, dtype=h5py.special_dtype(vlen=data[0].dtype), **kwds
+            key, chunks, dtype=h5py.special_dtype(vlen=data.flatten()[0].dtype), **kwds
         )
         return dset