diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 137ee188..d1fb0d7f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,18 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v5.0.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace + - id: check-added-large-files + - id: check-json + - id: check-toml + - id: name-tests-test + args: [--pytest-test-first] + - id: check-docstring-first - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 24.10.0 hooks: - id: black exclude: ^docs/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 639958cf..453504a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ * Remove support for python 3.8 and added testing for Python 3.13. @mavaylon1 [#240](https://github.com/hdmf-dev/hdmf-zarr/pull/240) * Added `NWBZarrIO.read_nwb` convenience method to simplify reading an NWB file. @oruebel [#226](https://github.com/hdmf-dev/hdmf-zarr/pull/226) * Updated optional dependency groups in `pyproject.toml` and GitHub Actions workflows. @rly, @mavaylon1 [#239](https://github.com/hdmf-dev/hdmf-zarr/pull/239) +* Applied black code formatter. @rly [#247](https://github.com/hdmf-dev/hdmf-zarr/pull/247) ### Bug Fixes * Fix reading of cached specs and caching of specs during export. @rly [#232](https://github.com/hdmf-dev/hdmf-zarr/pull/232) diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index de5b2302..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include LICENSE.txt versioneer.py src/hdmf_zarr/_version.py src/hdmf_zarr/_due.py -include requirements.txt requirements-dev.txt requirements-doc.txt requirements-opt.txt -include test.py tox.ini -graft tests diff --git a/pyproject.toml b/pyproject.toml index 25adfcce..65be5d24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,31 +109,9 @@ omit = [ [tool.black] line-length = 120 -target-version = ['py38'] +target-version = ['py313'] include = '\.pyi?$' -extend-exclude = ''' -/( - \.toml - |\.yml - |\.txt - |\.sh - |\.git - |\.ini - | \.hg - | \.mypy_cache - | \.tox - | \.venv - | build - | dist -)/ -''' -force-exclude = ''' -/( - /*.txt - /docs - /docs/* -)\ -''' +force-exclude = 'docs/*' [tool.ruff] lint.select = ["E", "F", "T100", "T201", "T203"] diff --git a/src/hdmf_zarr/__init__.py b/src/hdmf_zarr/__init__.py index 6a33ab4b..805efb0e 100644 --- a/src/hdmf_zarr/__init__.py +++ b/src/hdmf_zarr/__init__.py @@ -12,9 +12,14 @@ __version__ = version("hdmf") del version +__all__ = ["ZarrIO", "ZarrDataIO", "NWBZarrIO"] + # Duecredit definitions from ._due import due, BibTeX # noqa: E402 -due.cite(BibTeX(""" + +due.cite( + BibTeX( + """ @INPROCEEDINGS{9005648, author={A. J. {Tritt} and O. {RĂ¼bel} and B. {Dichter} and R. {Ly} and D. {Kang} and E. F. {Chang} and L. M. {Frank} and K. 
{Bouchard}}, booktitle={2019 IEEE International Conference on Big Data (Big Data)}, @@ -24,6 +29,11 @@ number={}, pages={165-179}, doi={10.1109/BigData47090.2019.9005648}} -"""), description="HDMF: Hierarchical Data Modeling Framework for Modern Science Data Standards", # noqa: E501 - path="hdmf/", version=__version__, cite_module=True) +""" # noqa: E501 + ), + description="HDMF: Hierarchical Data Modeling Framework for Modern Science Data Standards", + path="hdmf/", + version=__version__, + cite_module=True, +) del due, BibTeX diff --git a/src/hdmf_zarr/_due.py b/src/hdmf_zarr/_due.py index f729f843..f5d20fe9 100644 --- a/src/hdmf_zarr/_due.py +++ b/src/hdmf_zarr/_due.py @@ -1,6 +1,10 @@ # emacs: at the end of the file # ex: set sts=4 ts=4 sw=4 et: # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### # +from __future__ import annotations + +from typing import Any + """ Stub file for a guaranteed safe import of duecredit constructs: if duecredit @@ -24,46 +28,49 @@ License: BSD-2 """ -__version__ = '0.0.9' +__version__ = "0.0.9" -class InactiveDueCreditCollector(object): +class InactiveDueCreditCollector: """Just a stub at the Collector which would not do anything""" - def _donothing(self, *args, **kwargs): + + def _donothing(self, *_args: Any, **_kwargs: Any) -> None: """Perform no good and no bad""" pass - def dcite(self, *args, **kwargs): + def dcite(self, *_args: Any, **_kwargs: Any): """If I could cite I would""" + def nondecorating_decorator(func): return func + return nondecorating_decorator active = False activate = add = cite = dump = load = _donothing - def __repr__(self): - return self.__class__.__name__ + '()' + def __repr__(self) -> str: + return self.__class__.__name__ + "()" -def _donothing_func(*args, **kwargs): +def _donothing_func(*args: Any, **kwargs: Any) -> None: """Perform no good and no bad""" pass try: - from duecredit import due, BibTeX, Doi, Url, Text # lgtm [py/unused-import] - if 'due' in locals() and not hasattr(due, 'cite'): - raise RuntimeError( - "Imported due lacks .cite. DueCredit is now disabled") + from duecredit import BibTeX, Doi, Text, Url, due # lgtm [py/unused-import] + + if "due" in locals() and not hasattr(due, "cite"): + raise RuntimeError("Imported due lacks .cite. 
DueCredit is now disabled") except Exception as e: if not isinstance(e, ImportError): import logging - logging.getLogger("duecredit").error( - "Failed to import duecredit due to %s" % str(e)) + + logging.getLogger("duecredit").error("Failed to import duecredit due to %s" % str(e)) # Initiate due stub - due = InactiveDueCreditCollector() - BibTeX = Doi = Url = Text = _donothing_func + due = InactiveDueCreditCollector() # type: ignore + BibTeX = Doi = Url = Text = _donothing_func # type: ignore # Emacs mode definitions # Local Variables: diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index e209b8c0..5477e533 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -1,4 +1,5 @@ """Module with the Zarr-based I/O-backend for HDMF""" + # Python imports import os import shutil @@ -11,41 +12,21 @@ import zarr from zarr.hierarchy import Group from zarr.core import Array -from zarr.storage import (DirectoryStore, - TempStore, - NestedDirectoryStore, - ConsolidatedMetadataStore) +from zarr.storage import DirectoryStore, TempStore, NestedDirectoryStore, ConsolidatedMetadataStore import numcodecs # HDMF-ZARR imports -from .utils import (ZarrDataIO, - ZarrReference, - ZarrSpecWriter, - ZarrSpecReader, - ZarrIODataChunkIteratorQueue) +from .utils import ZarrDataIO, ZarrReference, ZarrSpecWriter, ZarrSpecReader, ZarrIODataChunkIteratorQueue from .zarr_utils import BuilderZarrReferenceDataset, BuilderZarrTableDataset # HDMF imports from hdmf.backends.io import HDMFIO from hdmf.backends.errors import UnsupportedOperation -from hdmf.backends.utils import (NamespaceToBuilderHelper, - WriteStatusTracker) -from hdmf.utils import (docval, - getargs, - popargs, - get_docval, - get_data_shape) -from hdmf.build import (Builder, - GroupBuilder, - DatasetBuilder, - LinkBuilder, - BuildManager, - ReferenceBuilder, - TypeMap) +from hdmf.backends.utils import NamespaceToBuilderHelper, WriteStatusTracker +from hdmf.utils import docval, getargs, popargs, get_docval, get_data_shape +from hdmf.build import Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager, ReferenceBuilder, TypeMap from hdmf.data_utils import AbstractDataChunkIterator -from hdmf.spec import (RefSpec, - DtypeSpec, - NamespaceCatalog) +from hdmf.spec import RefSpec, DtypeSpec, NamespaceCatalog from hdmf.query import HDMFDataset from hdmf.container import Container @@ -53,24 +34,22 @@ # Module variables -ROOT_NAME = 'root' +ROOT_NAME = "root" """ Name of the root builder for read/write """ -SPEC_LOC_ATTR = '.specloc' +SPEC_LOC_ATTR = ".specloc" """ Reserved attribute storing the path to the Group where the schema for the file are cached """ -DEFAULT_SPEC_LOC_DIR = 'specifications' +DEFAULT_SPEC_LOC_DIR = "specifications" """ Default name of the group where specifications should be cached """ -SUPPORTED_ZARR_STORES = (DirectoryStore, - TempStore, - NestedDirectoryStore) +SUPPORTED_ZARR_STORES = (DirectoryStore, TempStore, NestedDirectoryStore) """ Tuple listing all Zarr storage backends supported by ZarrIO """ @@ -87,34 +66,69 @@ def can_read(path): except Exception: return False - @docval({'name': 'path', - 'type': (str, Path, *SUPPORTED_ZARR_STORES), - 'doc': 'the path to the Zarr file or a supported Zarr store'}, - {'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None}, - {'name': 'mode', 'type': str, - 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "r-"). 
' - 'the mode r- is used to force open without consolidated metadata in read only mode.'}, - {'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool), - 'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.', - 'default': None}, - {'name': 'object_codec_class', 'type': None, - 'doc': 'Set the numcodec object codec class to be used to encode objects.' - 'Use numcodecs.pickles.Pickle by default.', - 'default': None}, - {'name': 'storage_options', 'type': dict, - 'doc': 'Zarr storage options to read remote folders', - 'default': None}, - {'name': 'force_overwrite', - 'type': bool, - 'doc': "force overwriting existing object when in 'w' mode. The existing file or directory" - " will be deleted when before opening (even if the object is not Zarr, e.g,. an HDF5 file)", - 'default': False} - ) + @docval( + { + "name": "path", + "type": (str, Path, *SUPPORTED_ZARR_STORES), + "doc": "the path to the Zarr file or a supported Zarr store", + }, + { + "name": "manager", + "type": BuildManager, + "doc": "the BuildManager to use for I/O", + "default": None, + }, + { + "name": "mode", + "type": str, + "doc": ( + 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "r-"). ' + "the mode r- is used to force open without consolidated metadata in read only mode." + ), + }, + { + "name": "synchronizer", + "type": (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool), + "doc": "Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.", + "default": None, + }, + { + "name": "object_codec_class", + "type": None, + "doc": ( + "Set the numcodec object codec class to be used to encode objects." + "Use numcodecs.pickles.Pickle by default." + ), + "default": None, + }, + { + "name": "storage_options", + "type": dict, + "doc": "Zarr storage options to read remote folders", + "default": None, + }, + { + "name": "force_overwrite", + "type": bool, + "doc": ( + "force overwriting existing object when in 'w' mode. The existing file or directory" + " will be deleted when before opening (even if the object is not Zarr, e.g,. an HDF5 file)" + ), + "default": False, + }, + ) def __init__(self, **kwargs): - self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) + self.logger = logging.getLogger("%s.%s" % (self.__class__.__module__, self.__class__.__qualname__)) path, manager, mode, synchronizer, object_codec_class, storage_options, force_overwrite = popargs( - 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', - 'storage_options', 'force_overwrite', kwargs) + "path", + "manager", + "mode", + "synchronizer", + "object_codec_class", + "storage_options", + "force_overwrite", + kwargs, + ) if manager is None: manager = BuildManager(TypeMap(NamespaceCatalog())) if isinstance(synchronizer, bool): @@ -184,31 +198,35 @@ def open(self): if self.__file is None: # Allow overwriting an existing file (e.g., an HDF5 file). Zarr will normally fail if the # existing object at the path is a file. So if we are in `w` mode we need to delete the file first - if self.mode == 'w' and self.__force_overwrite: + if self.mode == "w" and self.__force_overwrite: if isinstance(self.path, (str, Path)) and os.path.exists(self.path): - if os.path.isdir(self.path): # directory + if os.path.isdir(self.path): # directory shutil.rmtree(self.path) else: # File os.remove(self.path) # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'. 
# As a result, when in other modes, the file will not use consolidated metadata. - if self.mode != 'r': + if self.mode != "r": # When we consolidate metadata, we use ConsolidatedMetadataStore. # This interface does not allow for setting items. # In the doc string, it says it is "read only". As a result, we cannot use r+ with consolidate_metadata. # r- is only an internal mode in ZarrIO to force the use of regular open. For Zarr we need to # use the regular mode r when r- is specified - mode_to_use = self.mode if self.mode != 'r-' else 'r' - self.__file = zarr.open(store=self.path, - mode=mode_to_use, - synchronizer=self.__synchronizer, - storage_options=self.__storage_options) + mode_to_use = self.mode if self.mode != "r-" else "r" + self.__file = zarr.open( + store=self.path, + mode=mode_to_use, + synchronizer=self.__synchronizer, + storage_options=self.__storage_options, + ) else: - self.__file = self.__open_file_consolidated(store=self.path, - mode=self.mode, - synchronizer=self.__synchronizer, - storage_options=self.__storage_options) + self.__file = self.__open_file_consolidated( + store=self.path, + mode=self.mode, + synchronizer=self.__synchronizer, + storage_options=self.__storage_options, + ) def close(self): """Close the Zarr file""" @@ -218,29 +236,38 @@ def close(self): def is_remote(self): """Return True if the file is remote, False otherwise""" from zarr.storage import FSStore + if isinstance(self.file.store, FSStore): return True else: return False @classmethod - @docval({'name': 'namespace_catalog', - 'type': (NamespaceCatalog, TypeMap), - 'doc': 'the NamespaceCatalog or TypeMap to load namespaces into'}, - {'name': 'path', - 'type': (str, Path, *SUPPORTED_ZARR_STORES), - 'doc': 'the path to the Zarr file or a supported Zarr store'}, - {'name': 'storage_options', 'type': dict, - 'doc': 'Zarr storage options to read remote folders', - 'default': None}, - {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None} - ) + @docval( + { + "name": "namespace_catalog", + "type": (NamespaceCatalog, TypeMap), + "doc": "the NamespaceCatalog or TypeMap to load namespaces into", + }, + { + "name": "path", + "type": (str, Path, *SUPPORTED_ZARR_STORES), + "doc": "the path to the Zarr file or a supported Zarr store", + }, + { + "name": "storage_options", + "type": dict, + "doc": "Zarr storage options to read remote folders", + "default": None, + }, + {"name": "namespaces", "type": list, "doc": "the namespaces to load", "default": None}, + ) def load_namespaces(cls, namespace_catalog, path, storage_options, namespaces=None): - ''' + """ Load cached namespaces from a file. - ''' + """ # TODO: how to use storage_options here? 
- f = zarr.open(path, mode='r', storage_options=storage_options) + f = zarr.open(path, mode="r", storage_options=storage_options) if SPEC_LOC_ATTR not in f.attrs: msg = "No cached namespaces found in %s" % path warnings.warn(msg) @@ -253,17 +280,26 @@ def load_namespaces(cls, namespace_catalog, path, storage_options, namespaces=No latest_version = list(ns_group.keys())[-1] ns_group = ns_group[latest_version] reader = ZarrSpecReader(ns_group) - namespace_catalog.load_namespaces('namespace', reader=reader) + namespace_catalog.load_namespaces("namespace", reader=reader) @docval( - {'name': 'container', 'type': Container, 'doc': 'the Container object to write'}, - {'name': 'cache_spec', 'type': bool, 'doc': 'cache specification to file', 'default': True}, - {'name': 'link_data', 'type': bool, - 'doc': 'If not specified otherwise link (True) or copy (False) Datasets', 'default': True}, - {'name': 'exhaust_dci', 'type': bool, - 'doc': 'exhaust DataChunkIterators one at a time. If False, add ' + - 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', - 'default': True}, + {"name": "container", "type": Container, "doc": "the Container object to write"}, + {"name": "cache_spec", "type": bool, "doc": "cache specification to file", "default": True}, + { + "name": "link_data", + "type": bool, + "doc": "If not specified otherwise link (True) or copy (False) Datasets", + "default": True, + }, + { + "name": "exhaust_dci", + "type": bool, + "doc": ( + "exhaust DataChunkIterators one at a time. If False, add " + "them to the internal queue self.__dci_queue and exhaust them concurrently at the end" + ), + "default": True, + }, { "name": "number_of_jobs", "type": int, @@ -276,9 +312,7 @@ def load_namespaces(cls, namespace_catalog, path, storage_options, namespaces=No { "name": "max_threads_per_process", "type": int, - "doc": ( - "Limits the number of threads used by each process. The default is None (no limits)." - ), + "doc": ("Limits the number of threads used by each process. The default is None (no limits)."), "default": None, }, { @@ -293,11 +327,9 @@ def load_namespaces(cls, namespace_catalog, path, storage_options, namespaces=No { "name": "consolidate_metadata", "type": bool, - "doc": ( - "Consolidate metadata into a single .zmetadata file in the root group to accelerate read." 
- ), + "doc": ("Consolidate metadata into a single .zmetadata file in the root group to accelerate read."), "default": True, - } + }, ) def write(self, **kwargs): """Overwrite the write method to add support for caching the specification and parallelization.""" @@ -330,16 +362,16 @@ def __cache_spec(self): ns_builder = NamespaceToBuilderHelper.convert_namespace(ns_catalog, ns_name) namespace = ns_catalog.get_namespace(ns_name) if namespace.version is None: - group_name = '%s/unversioned' % ns_name + group_name = "%s/unversioned" % ns_name else: - group_name = '%s/%s' % (ns_name, namespace.version) + group_name = "%s/%s" % (ns_name, namespace.version) ns_group = spec_group.require_group(group_name) writer = ZarrSpecWriter(ns_group) - ns_builder.export('namespace', writer=writer) + ns_builder.export("namespace", writer=writer) @docval( *get_docval(HDMFIO.export), - {'name': 'cache_spec', 'type': bool, 'doc': 'whether to cache the specification to file', 'default': True}, + {"name": "cache_spec", "type": bool, "doc": "whether to cache the specification to file", "default": True}, { "name": "number_of_jobs", "type": int, @@ -352,9 +384,7 @@ def __cache_spec(self): { "name": "max_threads_per_process", "type": int, - "doc": ( - "Limits the number of threads used by each process. The default is None (no limits)." - ), + "doc": "Limits the number of threads used by each process. The default is None (no limits).", "default": None, }, { @@ -371,12 +401,13 @@ def export(self, **kwargs): """Export data read from a file from any backend to Zarr. See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details. """ - if self.mode != 'w': - raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'." - % (self.source, self.mode)) + if self.mode != "w": + raise UnsupportedOperation( + "Cannot export to file %s in mode '%s'. Please use mode 'w'." % (self.source, self.mode) + ) - src_io = getargs('src_io', kwargs) - write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs) + src_io = getargs("src_io", kwargs) + write_args, cache_spec = popargs("write_args", "cache_spec", kwargs) number_of_jobs, max_threads_per_process, multiprocessing_context = popargs( "number_of_jobs", "max_threads_per_process", "multiprocessing_context", kwargs ) @@ -387,24 +418,25 @@ def export(self, **kwargs): multiprocessing_context=multiprocessing_context, ) - if not isinstance(src_io, ZarrIO) and write_args.get('link_data', True): - raise UnsupportedOperation(f"Cannot export from non-Zarr backend { src_io.__class__.__name__} " + - "to Zarr with write argument link_data=True. " - + "Set write_args={'link_data': False}") + if not isinstance(src_io, ZarrIO) and write_args.get("link_data", True): + raise UnsupportedOperation( + f"Cannot export from non-Zarr backend { src_io.__class__.__name__} " + "to Zarr with write argument link_data=True. 
" + "Set write_args={'link_data': False}" + ) - write_args['export_source'] = src_io.source # pass export_source=src_io.source to write_builder + write_args["export_source"] = src_io.source # pass export_source=src_io.source to write_builder ckwargs = kwargs.copy() - ckwargs['write_args'] = write_args - if not write_args.get('link_data', True): - ckwargs['clear_cache'] = True + ckwargs["write_args"] = write_args + if not write_args.get("link_data", True): + ckwargs["clear_cache"] = True super().export(**ckwargs) if cache_spec: # add any namespaces from the src_io that have not yet been loaded for namespace in src_io.manager.namespace_catalog.namespaces: if namespace not in self.manager.namespace_catalog.namespaces: self.manager.namespace_catalog.add_namespace( - name=namespace, - namespace=src_io.manager.namespace_catalog.get_namespace(namespace) + name=namespace, namespace=src_io.manager.namespace_catalog.get_namespace(namespace) ) self.__cache_spec() @@ -426,61 +458,60 @@ def get_written(self, builder, check_on_disk=False): written = written and self.get_builder_exists_on_disk(builder=builder) return written - @docval({'name': 'builder', 'type': Builder, 'doc': 'The builder of interest'}) + @docval({"name": "builder", "type": Builder, "doc": "The builder of interest"}) def get_builder_exists_on_disk(self, **kwargs): """ Convenience function to check whether a given builder exists on disk in this Zarr file. """ - builder = getargs('builder', kwargs) + builder = getargs("builder", kwargs) builder_path = self.get_builder_disk_path(builder=builder, filepath=None) exists_on_disk = os.path.exists(builder_path) return exists_on_disk - @docval({'name': 'builder', 'type': Builder, 'doc': 'The builder of interest'}, - {'name': 'filepath', 'type': str, - 'doc': 'The path to the Zarr file or None for this file', 'default': None}) + @docval( + {"name": "builder", "type": Builder, "doc": "The builder of interest"}, + {"name": "filepath", "type": str, "doc": "The path to the Zarr file or None for this file", "default": None}, + ) def get_builder_disk_path(self, **kwargs): - builder, filepath = getargs('builder', 'filepath', kwargs) + builder, filepath = getargs("builder", "filepath", kwargs) basepath = filepath if filepath is not None else self.source builder_path = os.path.join(basepath, self.__get_path(builder).lstrip("/")) return builder_path @docval( - {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder object representing the NWBFile'}, + {"name": "builder", "type": GroupBuilder, "doc": "the GroupBuilder object representing the NWBFile"}, { - 'name': 'link_data', - 'type': bool, - 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', - 'default': True + "name": "link_data", + "type": bool, + "doc": "If not specified otherwise link (True) or copy (False) Zarr Datasets", + "default": True, }, { - 'name': 'exhaust_dci', - 'type': bool, - 'doc': ( - 'Exhaust DataChunkIterators one at a time. If False, add ' - 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end' + "name": "exhaust_dci", + "type": bool, + "doc": ( + "Exhaust DataChunkIterators one at a time. 
If False, add " + "them to the internal queue self.__dci_queue and exhaust them concurrently at the end" ), - 'default': True, + "default": True, }, { - 'name': 'export_source', - 'type': str, - 'doc': 'The source of the builders when exporting', - 'default': None, + "name": "export_source", + "type": str, + "doc": "The source of the builders when exporting", + "default": None, }, { "name": "consolidate_metadata", "type": bool, - "doc": ( - "Consolidate metadata into a single .zmetadata file in the root group to accelerate read." - ), + "doc": "Consolidate metadata into a single .zmetadata file in the root group to accelerate read.", "default": True, - } + }, ) def write_builder(self, **kwargs): """Write a builder to disk.""" f_builder, link_data, exhaust_dci, export_source, consolidate_metadata = getargs( - 'builder', 'link_data', 'exhaust_dci', 'export_source', 'consolidate_metadata', kwargs + "builder", "link_data", "exhaust_dci", "export_source", "consolidate_metadata", kwargs ) for name, gbldr in f_builder.groups.items(): self.write_group( @@ -501,8 +532,9 @@ def write_builder(self, **kwargs): self.write_attributes(self.__file, f_builder.attributes) # the same as set_attributes in HDMF self.__dci_queue.exhaust_queue() # Write any remaining DataChunkIterators that have been queued self._written_builders.set_written(f_builder) - self.logger.debug("Done writing %s '%s' to path '%s'" % - (f_builder.__class__.__qualname__, f_builder.name, self.source)) + self.logger.debug( + "Done writing %s '%s' to path '%s'" % (f_builder.__class__.__qualname__, f_builder.name, self.source) + ) # Consolidate metadata for the entire file after everything has been written if consolidate_metadata: @@ -522,44 +554,55 @@ def __get_store_path(store): return fpath - def __open_file_consolidated(self, - store, - mode, - synchronizer=None, - storage_options=None): + def __open_file_consolidated(self, store, mode, synchronizer=None, storage_options=None): """ This method will check to see if the metadata has been consolidated. If so, use open_consolidated. """ # This check is just a safeguard for possible errors in the future. But this should never happen - if mode == 'r-': - raise ValueError('Mode r- not allowed for reading with consolidated metadata') + if mode == "r-": + raise ValueError("Mode r- not allowed for reading with consolidated metadata") try: - return zarr.open_consolidated(store=store, - mode=mode, - synchronizer=synchronizer, - storage_options=storage_options) + return zarr.open_consolidated( + store=store, + mode=mode, + synchronizer=synchronizer, + storage_options=storage_options, + ) except KeyError: # A KeyError is raised when the '/.zmetadata' does not exist - return zarr.open(store=store, - mode=mode, - synchronizer=synchronizer, - storage_options=storage_options) - - @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, - {'name': 'builder', 'type': GroupBuilder, 'doc': 'the GroupBuilder to write'}, - {'name': 'link_data', 'type': bool, - 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', 'default': True}, - {'name': 'exhaust_dci', 'type': bool, - 'doc': 'exhaust DataChunkIterators one at a time. 
If False, add ' + - 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', - 'default': True}, - {'name': 'export_source', 'type': str, - 'doc': 'The source of the builders when exporting', 'default': None}, - returns='the Group that was created', rtype='Group') + return zarr.open( + store=store, + mode=mode, + synchronizer=synchronizer, + storage_options=storage_options, + ) + + @docval( + {"name": "parent", "type": Group, "doc": "the parent Zarr object"}, + {"name": "builder", "type": GroupBuilder, "doc": "the GroupBuilder to write"}, + { + "name": "link_data", + "type": bool, + "doc": "If not specified otherwise link (True) or copy (False) Zarr Datasets", + "default": True, + }, + { + "name": "exhaust_dci", + "type": bool, + "doc": ( + "exhaust DataChunkIterators one at a time. If False, add " + "them to the internal queue self.__dci_queue and exhaust them concurrently at the end" + ), + "default": True, + }, + {"name": "export_source", "type": str, "doc": "The source of the builders when exporting", "default": None}, + returns="the Group that was created", + rtype="Group", + ) def write_group(self, **kwargs): """Write a GroupBuider to file""" parent, builder, link_data, exhaust_dci, export_source = getargs( - 'parent', 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs + "parent", "builder", "link_data", "exhaust_dci", "export_source", kwargs ) if self.get_written(builder): @@ -575,7 +618,7 @@ def write_group(self, **kwargs): builder=sub_builder, link_data=link_data, exhaust_dci=exhaust_dci, - export_source=export_source + export_source=export_source, ) datasets = builder.datasets @@ -600,13 +643,17 @@ def write_group(self, **kwargs): self._written_builders.set_written(builder) # record that the builder has been written return group - @docval({'name': 'obj', 'type': (Group, Array), 'doc': 'the Zarr object to add attributes to'}, - {'name': 'attributes', - 'type': dict, - 'doc': 'a dict containing the attributes on the Group or Dataset, indexed by attribute name'}) + @docval( + {"name": "obj", "type": (Group, Array), "doc": "the Zarr object to add attributes to"}, + { + "name": "attributes", + "type": dict, + "doc": "a dict containing the attributes on the Group or Dataset, indexed by attribute name", + }, + ) def write_attributes(self, **kwargs): """Set (i.e., write) the attributes on a given Zarr Group or Array.""" - obj, attributes = getargs('obj', 'attributes', kwargs) + obj, attributes = getargs("obj", "attributes", kwargs) for key, value in attributes.items(): # Case 1: list, set, tuple type attributes @@ -622,19 +669,24 @@ def write_attributes(self, **kwargs): # Numpy scalars and bytes are not JSON serializable. 
Try to convert to a serializable type instead except TypeError as e: try: - tmp = tuple([i.item() - if (isinstance(i, np.generic) and not isinstance(i, np.bytes_)) - else i.decode("utf-8") - if isinstance(i, (bytes, np.bytes_)) - else i - for i in value]) + # TODO: refactor this to be more readable + tmp = tuple( + [ + ( + i.item() + if (isinstance(i, np.generic) and not isinstance(i, np.bytes_)) + else i.decode("utf-8") if isinstance(i, (bytes, np.bytes_)) else i + ) + for i in value + ] + ) obj.attrs[key] = tmp except: # noqa: E722 raise TypeError(str(e) + " type=" + str(type(value)) + " data=" + str(value)) from e # Case 2: References elif isinstance(value, (Builder, ReferenceBuilder)): refs = self._create_ref(value, self.path) - tmp = {'zarr_dtype': 'object', 'value': refs} + tmp = {"zarr_dtype": "object", "value": refs} obj.attrs[key] = tmp # Case 3: Scalar attributes else: @@ -645,11 +697,12 @@ def write_attributes(self, **kwargs): except TypeError as e: try: val = value.item if isinstance(value, np.ndarray) else value - val = value.item() \ - if (isinstance(value, np.generic) and not isinstance(value, np.bytes_)) \ - else val.decode("utf-8") \ - if isinstance(value, (bytes, np.bytes_)) \ - else val + # TODO: refactor this to be more readable + val = ( + value.item() + if (isinstance(value, np.generic) and not isinstance(value, np.bytes_)) + else val.decode("utf-8") if isinstance(value, (bytes, np.bytes_)) else val + ) obj.attrs[key] = val except: # noqa: E722 msg = str(e) + "key=" + key + " type=" + str(type(value)) + " data=" + str(value) @@ -744,10 +797,10 @@ def resolve_ref(self, zarr_ref): 2) the target zarr object within the target file """ # Extract the path as defined in the zarr_ref object - if zarr_ref.get('source', None) is None: - source_file = str(zarr_ref['path']) + if zarr_ref.get("source", None) is None: + source_file = str(zarr_ref["path"]) else: - source_file = str(zarr_ref['source']) + source_file = str(zarr_ref["source"]) # Resolve the path relative to the current file if not self.is_remote(): source_file = os.path.abspath(source_file) @@ -757,15 +810,17 @@ def resolve_ref(self, zarr_ref): source_path = str(source_file).lstrip(".") source_file = root_path + source_path - object_path = zarr_ref.get('path', None) + object_path = zarr_ref.get("path", None) if object_path: target_name = os.path.basename(object_path) else: target_name = ROOT_NAME - target_zarr_obj = self.__open_file_consolidated(store=source_file, - mode='r', - storage_options=self.__storage_options) + target_zarr_obj = self.__open_file_consolidated( + store=source_file, + mode="r", + storage_options=self.__storage_options, + ) if object_path is not None: try: target_zarr_obj = target_zarr_obj[object_path] @@ -793,7 +848,7 @@ def _create_ref(self, ref_object, ref_link_source=None): path = self.__get_path(builder) # This is the internal path in the store to the item. # get the object id if available - object_id = builder.get('object_id', None) + object_id = builder.get("object_id", None) # determine the object_id of the source by following the parents of the builder until we find the root # the root builder should be the same as the source file containing the reference curr = builder @@ -801,7 +856,7 @@ def _create_ref(self, ref_object, ref_link_source=None): curr = curr.parent if curr: - source_object_id = curr.get('object_id', None) + source_object_id = curr.get("object_id", None) # We did not find ROOT_NAME as a parent. 
This should only happen if we have an invalid # file as a source, e.g., if during testing we use an arbitrary builder. We check this # anyways to avoid potential errors just in case @@ -814,9 +869,9 @@ def _create_ref(self, ref_object, ref_link_source=None): # between backends a user should always use export which takes care of creating a clean set of builders. if ref_link_source is None: # TODO: Refactor appending a dataset of references so this doesn't need to be called. - ref_link_source = (builder.source - if (builder.source is not None and os.path.isdir(builder.source)) - else self.source) + ref_link_source = ( + builder.source if (builder.source is not None and os.path.isdir(builder.source)) else self.source + ) if not isinstance(ref_link_source, str): # self.path is sometimes given as the ref_link_source. It can @@ -844,7 +899,8 @@ def _create_ref(self, ref_object, ref_link_source=None): source=rel_source, path=path, object_id=object_id, - source_object_id=source_object_id) + source_object_id=source_object_id, + ) return ref def __add_link__(self, parent, target_source, target_path, link_name): @@ -858,23 +914,25 @@ def __add_link__(self, parent, target_source, target_path, link_name): :param link_name: Name of the link :type link_name: str """ - if 'zarr_link' not in parent.attrs: - parent.attrs['zarr_link'] = [] - zarr_link = list(parent.attrs['zarr_link']) - if not isinstance(target_source, str): # a store + if "zarr_link" not in parent.attrs: + parent.attrs["zarr_link"] = [] + zarr_link = list(parent.attrs["zarr_link"]) + if not isinstance(target_source, str): # a store target_source = target_source.path - zarr_link.append({'source': target_source, 'path': target_path, 'name': link_name}) - parent.attrs['zarr_link'] = zarr_link + zarr_link.append({"source": target_source, "path": target_path, "name": link_name}) + parent.attrs["zarr_link"] = zarr_link - @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, - {'name': 'builder', 'type': LinkBuilder, 'doc': 'the LinkBuilder to write'}, - {'name': 'export_source', 'type': str, - 'doc': 'The source of the builders when exporting', 'default': None},) + @docval( + {"name": "parent", "type": Group, "doc": "the parent Zarr object"}, + {"name": "builder", "type": LinkBuilder, "doc": "the LinkBuilder to write"}, + {"name": "export_source", "type": str, "doc": "The source of the builders when exporting", "default": None}, + ) def write_link(self, **kwargs): - parent, builder, export_source = getargs('parent', 'builder', 'export_source', kwargs) + parent, builder, export_source = getargs("parent", "builder", "export_source", kwargs) if self.get_written(builder): - self.logger.debug("Skipping LinkBuilder '%s' already written to parent group '%s'" - % (builder.name, parent.name)) + self.logger.debug( + "Skipping LinkBuilder '%s' already written to parent group '%s'" % (builder.name, parent.name) + ) return self.logger.debug("Writing LinkBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) @@ -925,49 +983,64 @@ def __setup_chunked_dataset__(cls, parent, name, data, options=None): """ io_settings = {} if options is not None: - if 'io_settings' in options: - io_settings = options.get('io_settings') + if "io_settings" in options: + io_settings = options.get("io_settings") # Define the chunking options if the user has not set them explicitly. We need chunking for the iterative write. 
- if 'chunks' not in io_settings: + if "chunks" not in io_settings: recommended_chunks = data.recommended_chunk_shape() - io_settings['chunks'] = True if recommended_chunks is None else recommended_chunks + io_settings["chunks"] = True if recommended_chunks is None else recommended_chunks # Define the shape of the data if not provided by the user - if 'shape' not in io_settings: - io_settings['shape'] = data.recommended_data_shape() - if 'dtype' not in io_settings: - if (options is not None) and ('dtype' in options): - io_settings['dtype'] = options['dtype'] + if "shape" not in io_settings: + io_settings["shape"] = data.recommended_data_shape() + if "dtype" not in io_settings: + if (options is not None) and ("dtype" in options): + io_settings["dtype"] = options["dtype"] else: - io_settings['dtype'] = data.dtype - if isinstance(io_settings['dtype'], str): + io_settings["dtype"] = data.dtype + if isinstance(io_settings["dtype"], str): # map to real dtype if we were given a string - io_settings['dtype'] = cls.__dtypes.get(io_settings['dtype']) + io_settings["dtype"] = cls.__dtypes.get(io_settings["dtype"]) try: dset = parent.create_dataset(name, **io_settings) - dset.attrs['zarr_dtype'] = np.dtype(io_settings['dtype']).str + dset.attrs["zarr_dtype"] = np.dtype(io_settings["dtype"]).str except Exception as exc: raise Exception("Could not create dataset %s in %s" % (name, parent.name)) from exc return dset - @docval({'name': 'parent', 'type': Group, 'doc': 'the parent Zarr object'}, # noqa: C901 - {'name': 'builder', 'type': DatasetBuilder, 'doc': 'the DatasetBuilder to write'}, - {'name': 'link_data', 'type': bool, - 'doc': 'If not specified otherwise link (True) or copy (False) Zarr Datasets', 'default': True}, - {'name': 'exhaust_dci', 'type': bool, - 'doc': 'exhaust DataChunkIterators one at a time. If False, add ' + - 'them to the internal queue self.__dci_queue and exhaust them concurrently at the end', - 'default': True}, - {'name': 'force_data', 'type': None, - 'doc': 'Used internally to force the data being used when we have to load the data', 'default': None}, - {'name': 'export_source', 'type': str, - 'doc': 'The source of the builders when exporting', 'default': None}, - returns='the Zarr array that was created', rtype=Array) + @docval( + {"name": "parent", "type": Group, "doc": "the parent Zarr object"}, # noqa: C901 + {"name": "builder", "type": DatasetBuilder, "doc": "the DatasetBuilder to write"}, + { + "name": "link_data", + "type": bool, + "doc": "If not specified otherwise link (True) or copy (False) Zarr Datasets", + "default": True, + }, + { + "name": "exhaust_dci", + "type": bool, + "doc": ( + "exhaust DataChunkIterators one at a time. 
If False, add " + "them to the internal queue self.__dci_queue and exhaust them concurrently at the end" + ), + "default": True, + }, + { + "name": "force_data", + "type": None, + "doc": "Used internally to force the data being used when we have to load the data", + "default": None, + }, + {"name": "export_source", "type": str, "doc": "The source of the builders when exporting", "default": None}, + returns="the Zarr array that was created", + rtype=Array, + ) def write_dataset(self, **kwargs): # noqa: C901 parent, builder, link_data, exhaust_dci, export_source = getargs( - 'parent', 'builder', 'link_data', 'exhaust_dci', 'export_source', kwargs + "parent", "builder", "link_data", "exhaust_dci", "export_source", kwargs ) - force_data = getargs('force_data', kwargs) + force_data = getargs("force_data", kwargs) if exhaust_dci and self.__dci_queue is None: self.__dci_queue = ZarrIODataChunkIteratorQueue() @@ -983,17 +1056,17 @@ def write_dataset(self, **kwargs): # noqa: C901 data = ZarrDataIO.from_h5py_dataset(h5dataset=data) # Separate data values and io_settings for write if isinstance(data, ZarrDataIO): - options['io_settings'] = data.io_settings + options["io_settings"] = data.io_settings link_data = data.link_data data = data.data else: - options['io_settings'] = {} + options["io_settings"] = {} if builder.dimension_labels is not None: - builder.attributes['_ARRAY_DIMENSIONS'] = builder.dimension_labels + builder.attributes["_ARRAY_DIMENSIONS"] = builder.dimension_labels attributes = builder.attributes - options['dtype'] = builder.dtype + options["dtype"] = builder.dtype linked = False @@ -1003,12 +1076,12 @@ def write_dataset(self, **kwargs): # noqa: C901 # copy the dataset data_filename = self.__get_store_path(data.store) if link_data: - if export_source is None: # not exporting + if export_source is None: # not exporting self.__add_link__(parent, data_filename, data.name, name) linked = True dset = None - else: # exporting - data_parent = '/'.join(data.name.split('/')[:-1]) + else: # exporting + data_parent = "/".join(data.name.split("/")[:-1]) # Case 1: The dataset is NOT in the export source, create a link to preserve the external link. # I have three files, FileA, FileB, FileC. I want to export FileA to FileB. FileA has an # EXTERNAL link to a dataset in Filec. This case preserves the link to FileC to also be in FileB. 
@@ -1040,14 +1113,16 @@ def write_dataset(self, **kwargs): # noqa: C901 # If we have a dataset of containers we need to make the references to the containers if len(data) > 0 and isinstance(data[0], Container): ref_data = [self._create_ref(data[i], ref_link_source=self.path) for i in range(len(data))] - shape = (len(data), ) - type_str = 'object' - dset = parent.require_dataset(name, - shape=shape, - dtype=object, - object_codec=self.__codec_cls(), - **options['io_settings']) - dset.attrs['zarr_dtype'] = type_str + shape = (len(data),) + type_str = "object" + dset = parent.require_dataset( + name, + shape=shape, + dtype=object, + object_codec=self.__codec_cls(), + **options["io_settings"], + ) + dset.attrs["zarr_dtype"] = type_str dset[:] = ref_data self._written_builders.set_written(builder) # record that the builder has been written # If we have a regular dataset, then load the data and write the builder after load @@ -1059,22 +1134,26 @@ def write_dataset(self, **kwargs): # noqa: C901 # We can/should not update the data in the builder itself so we load the data here and instead # force write_dataset when we call it recursively to use the data we loaded, rather than the # dataset that is set on the builder - dset = self.write_dataset(parent=parent, - builder=builder, - link_data=link_data, - force_data=data[:], - export_source=export_source) + dset = self.write_dataset( + parent=parent, + builder=builder, + link_data=link_data, + force_data=data[:], + export_source=export_source, + ) self._written_builders.set_written(builder) # record that the builder has been written # Write a compound dataset - elif isinstance(options['dtype'], list): + elif isinstance(options["dtype"], list): refs = list() type_str = list() - for i, dts in enumerate(options['dtype']): - if self.__is_ref(dts['dtype']): + for i, dts in enumerate(options["dtype"]): + if self.__is_ref(dts["dtype"]): refs.append(i) - type_str.append({'name': dts['name'], 'dtype': 'object'}) + type_str.append({"name": dts["name"], "dtype": "object"}) else: - i = list([dts, ]) + i = [ + dts, + ] t = self.__resolve_dtype_helper__(i) type_str.append(self.__serial_dtype__(t)[0]) @@ -1097,19 +1176,24 @@ def write_dataset(self, **kwargs): # noqa: C901 # dtype = self.__resolve_dtype_helper__(options['dtype']) new_dtype = [] - for field in options['dtype']: - if field['dtype'] is str or field['dtype'] in ( - 'str', 'text', 'utf', 'utf8', 'utf-8', 'isodatetime' + for field in options["dtype"]: + if field["dtype"] is str or field["dtype"] in ( + "str", + "text", + "utf", + "utf8", + "utf-8", + "isodatetime", ): # Zarr does not support variable length strings - new_dtype.append((field['name'], 'O')) - elif isinstance(field['dtype'], dict): + new_dtype.append((field["name"], "O")) + elif isinstance(field["dtype"], dict): # eg. for some references, dtype will be of the form # {'target_type': 'Baz', 'reftype': 'object'} # which should just get serialized as an object - new_dtype.append((field['name'], 'O')) + new_dtype.append((field["name"], "O")) else: - new_dtype.append((field['name'], self.__resolve_dtype_helper__(field['dtype']))) + new_dtype.append((field["name"], self.__resolve_dtype_helper__(field["dtype"]))) dtype = np.dtype(new_dtype) # cast and store compound dataset @@ -1119,34 +1203,36 @@ def write_dataset(self, **kwargs): # noqa: C901 shape=(len(arr),), dtype=dtype, object_codec=self.__codec_cls(), - **options['io_settings'] + **options["io_settings"], ) - dset.attrs['zarr_dtype'] = type_str + dset.attrs["zarr_dtype"] = type_str dset[...] 
= arr else: # write a compound datatype dset = self.__list_fill__(parent, name, data, options) # Write a dataset of references - elif self.__is_ref(options['dtype']): + elif self.__is_ref(options["dtype"]): # Note: ref_link_source is set to self.path because we do not do external references # We only support external links. if isinstance(data, ReferenceBuilder): shape = (1,) - type_str = 'object' + type_str = "object" refs = self._create_ref(data, ref_link_source=self.path) else: - shape = (len(data), ) - type_str = 'object' + shape = (len(data),) + type_str = "object" refs = [self._create_ref(item, ref_link_source=self.path) for item in data] - dset = parent.require_dataset(name, - shape=shape, - dtype=object, - object_codec=self.__codec_cls(), - **options['io_settings']) + dset = parent.require_dataset( + name, + shape=shape, + dtype=object, + object_codec=self.__codec_cls(), + **options["io_settings"], + ) self._written_builders.set_written(builder) # record that the builder has been written - dset.attrs['zarr_dtype'] = type_str - if hasattr(refs, '__len__'): + dset.attrs["zarr_dtype"] = type_str + if hasattr(refs, "__len__"): dset[:] = np.array(refs) else: dset[0] = refs @@ -1158,7 +1244,7 @@ def write_dataset(self, **kwargs): # noqa: C901 elif isinstance(data, AbstractDataChunkIterator): dset = self.__setup_chunked_dataset__(parent, name, data, options) self.__dci_queue.append(dataset=dset, data=data) - elif hasattr(data, '__len__'): + elif hasattr(data, "__len__"): dset = self.__list_fill__(parent, name, data, options) else: dset = self.__scalar_fill__(parent, name, data, options) @@ -1191,7 +1277,7 @@ def write_dataset(self, **kwargs): # noqa: C901 "utf8": str, "utf-8": str, "ascii": bytes, - "bytes": bytes, + "bytes": bytes, "str": str, "isodatetime": str, "string_": bytes, @@ -1214,13 +1300,13 @@ def __serial_dtype__(cls, dtype): ret = list() for n in dtype.names: item = dict() - item['name'] = n - item['dtype'] = cls.__serial_dtype__(dtype[n]) + item["name"] = n + item["dtype"] = cls.__serial_dtype__(dtype[n]) ret.append(item) return ret # TODO Does not work when Reference in compound datatype elif dtype == ZarrReference: - return 'object' + return "object" @classmethod def __resolve_dtype__(cls, dtype, data): @@ -1238,11 +1324,11 @@ def __resolve_dtype_helper__(cls, dtype): elif isinstance(dtype, str): return cls.__dtypes.get(dtype) elif isinstance(dtype, dict): - return cls.__dtypes.get(dtype['reftype']) + return cls.__dtypes.get(dtype["reftype"]) elif isinstance(dtype, list): - return np.dtype([(x['name'], cls.__resolve_dtype_helper__(x['dtype'])) for x in dtype]) + return np.dtype([(x["name"], cls.__resolve_dtype_helper__(x["dtype"])) for x in dtype]) else: - raise ValueError(f'Cant resolve dtype {dtype}') + raise ValueError(f"Can't resolve dtype {dtype}") @classmethod def get_type(cls, data): @@ -1250,50 +1336,50 @@ def get_type(cls, data): return cls.__dtypes.get("str") elif isinstance(data, bytes): return cls.__dtypes.get("bytes") - elif not hasattr(data, '__len__'): + elif not hasattr(data, "__len__"): return type(data) else: if len(data) == 0: - raise ValueError('cannot determine type for empty data') + raise ValueError("cannot determine type for empty data") return cls.get_type(data[0]) - __reserve_attribute = ('zarr_dtype', 'zarr_link') + __reserve_attribute = ("zarr_dtype", "zarr_link") def __list_fill__(self, parent, name, data, options=None): # noqa: C901 dtype = None io_settings = dict() if options is not None: - dtype = options.get('dtype') - if 
options.get('io_settings') is not None: - io_settings = options.get('io_settings') + dtype = options.get("dtype") + if options.get("io_settings") is not None: + io_settings = options.get("io_settings") # Determine the dtype if not isinstance(dtype, type): try: dtype = self.__resolve_dtype__(dtype, data) except Exception as exc: - msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) # noqa: F821 + msg = "cannot add %s to %s - could not determine type" % (name, parent.name) # noqa: F821 raise Exception(msg) from exc # Set the type_str type_str = self.__serial_dtype__(dtype) # Determine the shape and update the dtype if necessary when dtype==object - if 'shape' in io_settings: # Use the shape set by the user - data_shape = io_settings.pop('shape') + if "shape" in io_settings: # Use the shape set by the user + data_shape = io_settings.pop("shape") # If we have a numeric numpy-like array (e.g., numpy.array or h5py.Dataset) then use its shape elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.number) or dtype == np.bool_: # HDMF's get_data_shape may return the maxshape of an HDF5 dataset which can include None values # which Zarr does not allow for dataset shape. Check for the shape attribute first before falling # back on get_data_shape - if hasattr(data, 'shape') and data.shape is not None: + if hasattr(data, "shape") and data.shape is not None: data_shape = data.shape # This is a fall-back just in case. However this should not happen for standard numpy and h5py arrays - else: # pragma: no cover - data_shape = get_data_shape(data) # pragma: no cover + else: # pragma: no cover + data_shape = get_data_shape(data) # pragma: no cover # Deal with object dtype elif isinstance(dtype, np.dtype): data = data[:] # load the data in case we come from HDF5 or another on-disk data source we don't know - data_shape = (len(data), ) + data_shape = (len(data),) # if we have a compound data type if dtype.names: data_shape = get_data_shape(data) @@ -1302,7 +1388,7 @@ def __list_fill__(self, parent, name, data, options=None): # noqa: C901 for substype in dtype.fields.items(): if np.issubdtype(substype[1][0], np.flexible) or np.issubdtype(substype[1][0], np.object_): dtype = object - io_settings['object_codec'] = self.__codec_cls() + io_settings["object_codec"] = self.__codec_cls() break # sometimes bytes and strings can hide as object in numpy array so lets try # to write those as strings and bytes rather than as objects @@ -1316,17 +1402,17 @@ def __list_fill__(self, parent, name, data, options=None): # noqa: C901 # Set encoding for objects else: dtype = object - io_settings['object_codec'] = self.__codec_cls() + io_settings["object_codec"] = self.__codec_cls() # Determine the shape from the data if all other cases have not been hit else: data_shape = get_data_shape(data) # Create the dataset dset = parent.require_dataset(name, shape=data_shape, dtype=dtype, **io_settings) - dset.attrs['zarr_dtype'] = type_str + dset.attrs["zarr_dtype"] = type_str # Write the data to file - if dtype == object: # noqa: E721 + if dtype == object: # noqa: E721 for c in np.ndindex(data_shape): o = data for i in c: @@ -1344,7 +1430,7 @@ def __list_fill__(self, parent, name, data, options=None): # noqa: C901 except ValueError: for i in range(len(data)): dset[i] = data[i] - except TypeError: # If data is an h5py.Dataset with strings, they may need to be decoded + except TypeError: # If data is an h5py.Dataset with strings, they may need to be decoded for c in np.ndindex(data_shape): o = data for i in c: 
@@ -1357,26 +1443,26 @@ def __scalar_fill__(self, parent, name, data, options=None): dtype = None io_settings = dict() if options is not None: - dtype = options.get('dtype') - io_settings = options.get('io_settings') + dtype = options.get("dtype") + io_settings = options.get("io_settings") if io_settings is None: io_settings = dict() if not isinstance(dtype, type): try: dtype = self.__resolve_dtype__(dtype, data) except Exception as exc: - msg = 'cannot add %s to %s - could not determine type' % (name, parent.name) + msg = "cannot add %s to %s - could not determine type" % (name, parent.name) raise Exception(msg) from exc - if dtype == object: # noqa: E721 - io_settings['object_codec'] = self.__codec_cls() + if dtype == object: # noqa: E721 + io_settings["object_codec"] = self.__codec_cls() - dset = parent.require_dataset(name, shape=(1, ), dtype=dtype, **io_settings) + dset = parent.require_dataset(name, shape=(1,), dtype=dtype, **io_settings) dset[:] = data - type_str = 'scalar' - dset.attrs['zarr_dtype'] = type_str + type_str = "scalar" + dset.attrs["zarr_dtype"] = type_str return dset - @docval(returns='a GroupBuilder representing the NWB Dataset', rtype='GroupBuilder') + @docval(returns="a GroupBuilder representing the NWB Dataset", rtype="GroupBuilder") def read_builder(self): f_builder = self.__read_group(self.__file, ROOT_NAME) return f_builder @@ -1387,31 +1473,37 @@ def __set_built(self, zarr_obj, builder): path = os.path.join(fpath, path) self.__built.setdefault(path, builder) - @docval({'name': 'zarr_obj', 'type': (Array, Group), - 'doc': 'the Zarr object to the corresponding Container/Data object for'}) + @docval( + { + "name": "zarr_obj", + "type": (Array, Group), + "doc": "the Zarr object to the corresponding Container/Data object for", + } + ) def get_container(self, **kwargs): """ Get the container for the corresponding Zarr Group or Dataset :raises ValueError: When no builder has been constructed yet for the given h5py object """ - zarr_obj = getargs('zarr_obj', kwargs) + zarr_obj = getargs("zarr_obj", kwargs) builder = self.get_builder(zarr_obj) container = self.manager.construct(builder) return container # TODO: This method should be moved to HDMFIO - @docval({'name': 'zarr_obj', 'type': (Array, Group), - 'doc': 'the Zarr object to the corresponding Builder object for'}) + @docval( + {"name": "zarr_obj", "type": (Array, Group), "doc": "the Zarr object to the corresponding Builder object for"} + ) def get_builder(self, **kwargs): # TODO: move this to HDMFIO (define skeleton in there at least) """ Get the builder for the corresponding Group or Dataset :raises ValueError: When no builder has been constructed """ - zarr_obj = kwargs['zarr_obj'] + zarr_obj = kwargs["zarr_obj"] builder = self.__get_built(zarr_obj) if builder is None: - msg = '%s has not been built' % (zarr_obj.name) + msg = "%s has not been built" % (zarr_obj.name) raise ValueError(msg) return builder @@ -1474,10 +1566,10 @@ def __read_links(self, zarr_obj, parent): :type parent: GroupBuilder """ # read links - if 'zarr_link' in zarr_obj.attrs: - links = zarr_obj.attrs['zarr_link'] + if "zarr_link" in zarr_obj.attrs: + links = zarr_obj.attrs["zarr_link"] for link in links: - link_name = link['name'] + link_name = link["name"] target_name, target_zarr_obj = self.resolve_ref(link) # NOTE: __read_group and __read_dataset return the cached builders if the target has already been built if isinstance(target_zarr_obj, Group): @@ -1494,9 +1586,9 @@ def __read_dataset(self, zarr_obj, name): if ret is not None: return 
ret - if 'zarr_dtype' in zarr_obj.attrs: - zarr_dtype = zarr_obj.attrs['zarr_dtype'] - elif hasattr(zarr_obj, 'dtype'): # Fallback for invalid files that are missing zarr_type + if "zarr_dtype" in zarr_obj.attrs: + zarr_dtype = zarr_obj.attrs["zarr_dtype"] + elif hasattr(zarr_obj, "dtype"): # Fallback for invalid files that are missing zarr_type zarr_dtype = zarr_obj.dtype warnings.warn( "Inferred dtype from zarr type. Dataset missing zarr_dtype: " + str(name) + " " + str(zarr_obj) @@ -1509,35 +1601,37 @@ def __read_dataset(self, zarr_obj, name): else: source = zarr_obj.store.path - kwargs = {"attributes": self.__read_attrs(zarr_obj), - "dtype": zarr_dtype, - "maxshape": zarr_obj.shape, - "chunks": not (zarr_obj.shape == zarr_obj.chunks), - "source": source} - dtype = kwargs['dtype'] + kwargs = { + "attributes": self.__read_attrs(zarr_obj), + "dtype": zarr_dtype, + "maxshape": zarr_obj.shape, + "chunks": not (zarr_obj.shape == zarr_obj.chunks), + "source": source, + } + dtype = kwargs["dtype"] # By default, use the zarr.core.Array as data for lazy data load data = zarr_obj # Read scalar dataset - if dtype == 'scalar': + if dtype == "scalar": data = zarr_obj[()] if isinstance(dtype, list): # Check compound dataset where one of the subsets contains references has_reference = False for i, dts in enumerate(dtype): - if dts['dtype'] == 'object': # check items for object reference + if dts["dtype"] == "object": # check items for object reference has_reference = True break - retrieved_dtypes = [dtype_dict['dtype'] for dtype_dict in dtype] + retrieved_dtypes = [dtype_dict["dtype"] for dtype_dict in dtype] if has_reference: data = BuilderZarrTableDataset(zarr_obj, self, retrieved_dtypes) elif self.__is_ref(dtype): # Array of references data = BuilderZarrReferenceDataset(data, self) - kwargs['data'] = data + kwargs["data"] = data if name is None: name = str(os.path.basename(zarr_obj.name)) ret = DatasetBuilder(name, **kwargs) # create builder object for dataset @@ -1551,9 +1645,9 @@ def __read_attrs(self, zarr_obj): for k in zarr_obj.attrs.keys(): if k not in self.__reserve_attribute: v = zarr_obj.attrs[k] - if isinstance(v, dict) and 'zarr_dtype' in v: - if v['zarr_dtype'] == 'object': - target_name, target_zarr_obj = self.resolve_ref(v['value']) + if isinstance(v, dict) and "zarr_dtype" in v: + if v["zarr_dtype"] == "object": + target_name, target_zarr_obj = self.resolve_ref(v["value"]) if isinstance(target_zarr_obj, zarr.hierarchy.Group): ret[k] = self.__read_group(target_zarr_obj, target_name) else: diff --git a/src/hdmf_zarr/nwb.py b/src/hdmf_zarr/nwb.py index 2b4d3bfb..97a62573 100644 --- a/src/hdmf_zarr/nwb.py +++ b/src/hdmf_zarr/nwb.py @@ -1,13 +1,11 @@ """Module with Zarr backend for NWB for integration with PyNWB""" + from pathlib import Path from .backend import ZarrIO, SUPPORTED_ZARR_STORES -from hdmf.utils import (docval, - popargs, - get_docval) +from hdmf.utils import docval, popargs, get_docval from hdmf.backends.io import HDMFIO -from hdmf.build import (BuildManager, - TypeMap) +from hdmf.build import BuildManager, TypeMap from pynwb import get_manager, get_type_map @@ -19,25 +17,34 @@ class NWBZarrIO(ZarrIO): is to perform default setup for BuildManager, loading or namespaces etc., in the context of the NWB format. 
""" - @docval(*get_docval(ZarrIO.__init__), - {'name': 'load_namespaces', 'type': bool, - 'doc': 'whether or not to load cached namespaces from given path - not applicable in write mode', - 'default': True}, - {'name': 'extensions', 'type': (str, TypeMap, list), - 'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps', - 'default': None}) + + @docval( + *get_docval(ZarrIO.__init__), + { + "name": "load_namespaces", + "type": bool, + "doc": "whether or not to load cached namespaces from given path - not applicable in write mode", + "default": True, + }, + { + "name": "extensions", + "type": (str, TypeMap, list), + "doc": "a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps", + "default": None, + }, + ) def __init__(self, **kwargs): - path, mode, manager, extensions, load_namespaces, synchronizer, storage_options = \ - popargs('path', 'mode', 'manager', 'extensions', - 'load_namespaces', 'synchronizer', 'storage_options', kwargs) + path, mode, manager, extensions, load_namespaces, synchronizer, storage_options = popargs( + "path", "mode", "manager", "extensions", "load_namespaces", "synchronizer", "storage_options", kwargs + ) - io_modes_that_create_file = ['w', 'w-', 'x'] + io_modes_that_create_file = ["w", "w-", "x"] if mode in io_modes_that_create_file or manager is not None or extensions is not None: load_namespaces = False if load_namespaces: tm = get_type_map() - super(NWBZarrIO, self).load_namespaces(tm, path, storage_options) + super().load_namespaces(tm, path, storage_options) manager = BuildManager(tm) else: if manager is not None and extensions is not None: @@ -46,34 +53,38 @@ def __init__(self, **kwargs): manager = get_manager(extensions=extensions) elif manager is None: manager = get_manager() - super(NWBZarrIO, self).__init__(path, - manager=manager, - mode=mode, - synchronizer=synchronizer, - storage_options=storage_options) + super().__init__(path, manager=manager, mode=mode, synchronizer=synchronizer, storage_options=storage_options) - @docval({'name': 'src_io', 'type': HDMFIO, 'doc': 'the HDMFIO object for reading the data to export'}, - {'name': 'nwbfile', 'type': 'NWBFile', - 'doc': 'the NWBFile object to export. If None, then the entire contents of src_io will be exported', - 'default': None}, - {'name': 'write_args', 'type': dict, 'doc': 'arguments to pass to :py:meth:`write_builder`', - 'default': dict()}) + @docval( + {"name": "src_io", "type": HDMFIO, "doc": "the HDMFIO object for reading the data to export"}, + { + "name": "nwbfile", + "type": "NWBFile", + "doc": "the NWBFile object to export. 
If None, then the entire contents of src_io will be exported", + "default": None, + }, + {"name": "write_args", "type": dict, "doc": "arguments to pass to :py:meth:`write_builder`", "default": dict()}, + ) def export(self, **kwargs): - nwbfile = popargs('nwbfile', kwargs) - kwargs['container'] = nwbfile + nwbfile = popargs("nwbfile", kwargs) + kwargs["container"] = nwbfile super().export(**kwargs) @staticmethod - @docval({'name': 'path', - 'type': (str, Path, *SUPPORTED_ZARR_STORES), - 'doc': 'the path to the Zarr file or a supported Zarr store'}, - is_method=False) + @docval( + { + "name": "path", + "type": (str, Path, *SUPPORTED_ZARR_STORES), + "doc": "the path to the Zarr file or a supported Zarr store", + }, + is_method=False, + ) def read_nwb(**kwargs): """ Helper factory method for reading an NWB file and return the NWBFile object """ # Retrieve the filepath - path = popargs('path', kwargs) + path = popargs("path", kwargs) if isinstance(path, Path): path = str(path) # determine default storage options to use when opening a file from S3 diff --git a/src/hdmf_zarr/utils.py b/src/hdmf_zarr/utils.py index 1c012a22..e51b137b 100644 --- a/src/hdmf_zarr/utils.py +++ b/src/hdmf_zarr/utils.py @@ -1,4 +1,5 @@ """Collection of utility I/O classes for the ZarrIO backend store.""" + import gc import traceback import multiprocessing @@ -47,13 +48,14 @@ class ZarrIODataChunkIteratorQueue(deque): Note that "fork" is only available on UNIX systems (not Windows). :type multiprocessing_context: string or None """ + def __init__( self, number_of_jobs: int = 1, max_threads_per_process: Union[None, int] = None, multiprocessing_context: Union[None, Literal["fork", "spawn"]] = None, ): - self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) + self.logger = logging.getLogger("%s.%s" % (self.__class__.__module__, self.__class__.__qualname__)) self.number_of_jobs = number_of_jobs self.max_threads_per_process = max_threads_per_process @@ -118,8 +120,7 @@ def exhaust_queue(self): display_progress = False r_bar_in_MB = ( - "| {n_fmt}/{total_fmt} MB [Elapsed: {elapsed}, " - "Remaining: {remaining}, Rate:{rate_fmt}{postfix}]" + "| {n_fmt}/{total_fmt} MB [Elapsed: {elapsed}, Remaining: {remaining}, Rate:{rate_fmt}{postfix}]" ) bar_format = "{l_bar}{bar}" + f"{r_bar_in_MB}" progress_bar_options = dict( @@ -128,7 +129,7 @@ def exhaust_queue(self): bar_format=bar_format, unit="MB", ) - for (zarr_dataset, iterator) in iter(self): + for zarr_dataset, iterator in iter(self): # Parallel write only works well with GenericDataChunkIterators # Due to perfect alignment between chunks and buffers if not isinstance(iterator, GenericDataChunkIterator): @@ -149,7 +150,8 @@ def exhaust_queue(self): display_progress = display_progress or iterator.display_progress iterator.display_progress = False per_iterator_progress_options = { - key: value for key, value in iterator.progress_bar_options.items() + key: value + for key, value in iterator.progress_bar_options.items() if key not in ["desc", "total", "file"] } progress_bar_options.update(**per_iterator_progress_options) @@ -158,9 +160,9 @@ def exhaust_queue(self): for buffer_selection in iterator.buffer_selection_generator: buffer_map_args = (zarr_dataset.store.path, zarr_dataset.path, iterator, buffer_selection) buffer_map.append(buffer_map_args) - buffer_size_in_MB = math.prod( - [slice_.stop - slice_.start for slice_ in buffer_selection] - ) * iterator_itemsize / 1e6 + buffer_size_in_MB = ( + math.prod([slice_.stop - slice_.start for 
slice_ in buffer_selection]) * iterator_itemsize / 1e6 + ) size_in_MB_per_iteration.append(buffer_size_in_MB) progress_bar_options.update( total=int(sum(size_in_MB_per_iteration)), # int() to round down to nearest integer for better display @@ -168,7 +170,7 @@ def exhaust_queue(self): if parallelizable_iterators: # Avoid spinning up ProcessPool if no candidates during this exhaustion # Remove candidates for parallelization from the queue - for (zarr_dataset, iterator) in parallelizable_iterators: + for zarr_dataset, iterator in parallelizable_iterators: self.remove((zarr_dataset, iterator)) operation_to_run = self._write_buffer_zarr @@ -182,7 +184,7 @@ def exhaust_queue(self): operation_to_run, process_initialization, initialization_arguments, - self.max_threads_per_process + self.max_threads_per_process, ), ) as executor: results = executor.map(self.function_wrapper, buffer_map) @@ -263,7 +265,7 @@ def initializer_wrapper( operation_to_run: callable, process_initialization: callable, initialization_arguments: Iterable, # TODO: eventually standardize with typing.Iterable[typing.Any] - max_threads_per_process: Optional[int] = None + max_threads_per_process: Optional[int] = None, ): # keyword arguments here are just for readability, ProcessPool only takes a tuple """ Needed as a part of a bug fix with cloud memory leaks discovered by SpikeInterface team. @@ -320,7 +322,7 @@ def function_wrapper(args: Tuple[str, str, AbstractDataChunkIterator, Tuple[slic zarr_store_path, relative_dataset_path, iterator, - buffer_selection + buffer_selection, ) else: with threadpool_limits(limits=max_threads_per_process): @@ -338,25 +340,27 @@ class ZarrSpecWriter(SpecWriter): Class used to write format specs to Zarr """ - @docval({'name': 'group', 'type': Group, 'doc': 'the Zarr file to write specs to'}) + @docval({"name": "group", "type": Group, "doc": "the Zarr file to write specs to"}) def __init__(self, **kwargs): - self.__group = getargs('group', kwargs) + self.__group = getargs("group", kwargs) @staticmethod def stringify(spec): """ Converts a spec into a JSON string to write to a dataset """ - return json.dumps(spec, separators=(',', ':')) + return json.dumps(spec, separators=(",", ":")) def __write(self, d, name): data = self.stringify(d) - dset = self.__group.require_dataset(name, - shape=(1, ), - dtype=object, - object_codec=numcodecs.JSON(), - compressor=None) - dset.attrs['zarr_dtype'] = 'scalar' + dset = self.__group.require_dataset( + name, + shape=(1,), + dtype=object, + object_codec=numcodecs.JSON(), + compressor=None, + ) + dset.attrs["zarr_dtype"] = "scalar" dset[0] = data return dset @@ -366,7 +370,7 @@ def write_spec(self, spec, path): def write_namespace(self, namespace, path): """Write a namespace to the given path""" - return self.__write({'namespaces': [namespace]}, path) + return self.__write({"namespaces": [namespace]}, path) class ZarrSpecReader(SpecReader): @@ -374,9 +378,9 @@ class ZarrSpecReader(SpecReader): Class to read format specs from Zarr """ - @docval({'name': 'group', 'type': Group, 'doc': 'the Zarr file to read specs from'}) + @docval({"name": "group", "type": Group, "doc": "the Zarr file to read specs from"}) def __init__(self, **kwargs): - self.__group = getargs('group', kwargs) + self.__group = getargs("group", kwargs) source = "%s:%s" % (os.path.abspath(self.__group.store.path), self.__group.name) super().__init__(source=source) self.__cache = None @@ -394,7 +398,7 @@ def read_namespace(self, ns_path): """Read a namespace from the given path""" if self.__cache is 
None: self.__cache = self.__read(ns_path) - ret = self.__cache['namespaces'] + ret = self.__cache["namespaces"] return ret @@ -404,63 +408,81 @@ class ZarrDataIO(DataIO): for data arrays. """ - @docval({'name': 'data', - 'type': (np.ndarray, list, tuple, zarr.Array, Iterable), - 'doc': 'the data to be written. NOTE: If an zarr.Array is used, all other settings but link_data' + - ' will be ignored as the dataset will either be linked to or copied as is in ZarrIO.'}, - {'name': 'chunks', - 'type': (list, tuple), - 'doc': 'Chunk shape', - 'default': None}, - {'name': 'fillvalue', - 'type': None, - 'doc': 'Value to be returned when reading uninitialized parts of the dataset', - 'default': None}, - {'name': 'compressor', - 'type': (numcodecs.abc.Codec, bool), - 'doc': 'Zarr compressor filter to be used. Set to True to use Zarr default.' - 'Set to False to disable compression)', - 'default': None}, - {'name': 'filters', - 'type': (list, tuple), - 'doc': 'One or more Zarr-supported codecs used to transform data prior to compression.', - 'default': None}, - {'name': 'link_data', - 'type': bool, - 'doc': 'If data is an zarr.Array should it be linked to or copied. NOTE: This parameter is only ' + - 'allowed if data is an zarr.Array', - 'default': False} - ) + @docval( + { + "name": "data", + "type": (np.ndarray, list, tuple, zarr.Array, Iterable), + "doc": ( + "the data to be written. NOTE: If an zarr.Array is used, all other settings but link_data " + "will be ignored as the dataset will either be linked to or copied as is in ZarrIO." + ), + }, + { + "name": "chunks", + "type": (list, tuple), + "doc": "Chunk shape", + "default": None, + }, + { + "name": "fillvalue", + "type": None, + "doc": "Value to be returned when reading uninitialized parts of the dataset", + "default": None, + }, + { + "name": "compressor", + "type": (numcodecs.abc.Codec, bool), + "doc": ( + "Zarr compressor filter to be used. Set to True to use Zarr default. " + "Set to False to disable compression)" + ), + "default": None, + }, + { + "name": "filters", + "type": (list, tuple), + "doc": "One or more Zarr-supported codecs used to transform data prior to compression.", + "default": None, + }, + { + "name": "link_data", + "type": bool, + "doc": ( + "If data is an zarr.Array should it be linked to or copied. NOTE: This parameter is only " + "allowed if data is an zarr.Array" + ), + "default": False, + }, + ) def __init__(self, **kwargs): # TODO Need to add error checks and warnings to ZarrDataIO to check for parameter collisions and add tests data, chunks, fill_value, compressor, filters, self.__link_data = getargs( - 'data', 'chunks', 'fillvalue', 'compressor', 'filters', 'link_data', kwargs) + "data", "chunks", "fillvalue", "compressor", "filters", "link_data", kwargs + ) # NOTE: dtype and shape of the DataIO base class are not yet supported by ZarrDataIO. # These parameters are used to create empty data to allocate the data but # leave the I/O to fill the data to the user. 
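        # The keyword arguments above are translated into Zarr dataset settings below:
        # ``chunks`` -> io_settings["chunks"], ``fillvalue`` -> io_settings["fill_value"],
        # ``filters`` -> io_settings["filters"], and ``compressor`` -> io_settings["compressor"].
        # ``compressor=False`` is mapped to None to disable compression, ``compressor=True`` keeps
        # the Zarr default, and a numcodecs codec object is passed through as-is. ``link_data`` is
        # only honored when ``data`` is a zarr.Array and is reset to False otherwise. The resulting
        # io_settings dict is presumably consumed by ZarrIO when the wrapped data is written.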
- super(ZarrDataIO, self).__init__(data=data, - dtype=None, - shape=None) + super().__init__(data=data, dtype=None, shape=None) if not isinstance(data, zarr.Array) and self.__link_data: self.__link_data = False self.__iosettings = dict() if chunks is not None: - self.__iosettings['chunks'] = chunks + self.__iosettings["chunks"] = chunks if fill_value is not None: - self.__iosettings['fill_value'] = fill_value + self.__iosettings["fill_value"] = fill_value if compressor is not None: if isinstance(compressor, bool): # Disable compression by setting compressor to None if not compressor: - self.__iosettings['compressor'] = None + self.__iosettings["compressor"] = None # To use default settings simply do not specify any compressor settings else: pass # use the user-specified compressor else: - self.__iosettings['compressor'] = compressor + self.__iosettings["compressor"] = compressor if filters is not None: - self.__iosettings['filters'] = filters + self.__iosettings["filters"] = filters @property def link_data(self) -> bool: @@ -487,16 +509,17 @@ def from_h5py_dataset(h5dataset, **kwargs): :returns: ZarrDataIO object wrapping the dataset """ filters = ZarrDataIO.hdf5_to_zarr_filters(h5dataset) - fillval = h5dataset.fillvalue if 'fillvalue' not in kwargs else kwargs.pop('fillvalue') - if isinstance(fillval, bytes): # bytes are not JSON serializable so use string instead + fillval = h5dataset.fillvalue if "fillvalue" not in kwargs else kwargs.pop("fillvalue") + if isinstance(fillval, bytes): # bytes are not JSON serializable so use string instead fillval = fillval.decode("utf-8") - chunks = h5dataset.chunks if 'chunks' not in kwargs else kwargs.pop('chunks') + chunks = h5dataset.chunks if "chunks" not in kwargs else kwargs.pop("chunks") re = ZarrDataIO( data=h5dataset, filters=filters, fillvalue=fillval, chunks=chunks, - **kwargs) + **kwargs, + ) return re @staticmethod @@ -507,7 +530,7 @@ def hdf5_to_zarr_filters(h5dataset) -> list: # Check for unsupported filters if h5dataset.scaleoffset: # TODO: translate to numcodecs.fixedscaleoffset.FixedScaleOffset() - warn( f"{h5dataset.name} HDF5 scaleoffset filter ignored in Zarr") + warn(f"{h5dataset.name} HDF5 scaleoffset filter ignored in Zarr") if h5dataset.compression in ("szip", "lzf"): warn(f"{h5dataset.name} HDF5 szip or lzf compression ignored in Zarr") # Add the shuffle filter if possible @@ -524,7 +547,8 @@ def hdf5_to_zarr_filters(h5dataset) -> list: blocksize=total_bytes, clevel=clevel, shuffle=shuffle, - cname=blosc_compressors[compressor]) + cname=blosc_compressors[compressor], + ) filters.append(numcodecs.Blosc(**pars)) elif filter_id_str == "32015": filters.append(numcodecs.Zstd(level=properties[0])) @@ -534,7 +558,7 @@ def hdf5_to_zarr_filters(h5dataset) -> list: warn(f"{h5dataset.name} HDF5 lz4 compression ignored in Zarr") elif filter_id_str == "32008": warn(f"{h5dataset.name} HDF5 bitshuffle compression ignored in Zarr") - elif filter_id_str == "shuffle": # already handled above + elif filter_id_str == "shuffle": # already handled above pass else: warn(f"{h5dataset.name} HDF5 filter id {filter_id} with properties {properties} ignored in Zarr.") @@ -543,34 +567,45 @@ def hdf5_to_zarr_filters(h5dataset) -> list: @staticmethod def is_h5py_dataset(obj): """Check if the object is an instance of h5py.Dataset without requiring import of h5py""" - return (obj.__class__.__module__, obj.__class__.__name__) == ('h5py._hl.dataset', 'Dataset') + return (obj.__class__.__module__, obj.__class__.__name__) == ("h5py._hl.dataset", "Dataset") + class 
ZarrReference(dict): """ Data structure to describe a reference to another container used with the ZarrIO backend """ - @docval({'name': 'source', - 'type': str, - 'doc': 'Source of referenced object. Usually the relative path to the ' - 'Zarr file containing the referenced object', - 'default': None}, - {'name': 'path', - 'type': str, - 'doc': 'Path of referenced object within the source', - 'default': None}, - {'name': 'object_id', - 'type': str, - 'doc': 'Object_id of the referenced object (if available)', - 'default': None}, - {'name': 'source_object_id', - 'type': str, - 'doc': 'Object_id of the source (should always be available)', - 'default': None} - ) + @docval( + { + "name": "source", + "type": str, + "doc": "Source of referenced object. Usually the relative path to the " + "Zarr file containing the referenced object", + "default": None, + }, + { + "name": "path", + "type": str, + "doc": "Path of referenced object within the source", + "default": None, + }, + { + "name": "object_id", + "type": str, + "doc": "Object_id of the referenced object (if available)", + "default": None, + }, + { + "name": "source_object_id", + "type": str, + "doc": "Object_id of the source (should always be available)", + "default": None, + }, + ) def __init__(self, **kwargs): dest_source, dest_path, dest_object_id, dest_source_object_id = getargs( - 'source', 'path', 'object_id', 'source_object_id', kwargs) + "source", "path", "object_id", "source_object_id", kwargs + ) super(ZarrReference, self).__init__() self.source = dest_source self.path = dest_path @@ -579,32 +614,32 @@ def __init__(self, **kwargs): @property def source(self) -> str: - return super(ZarrReference, self).__getitem__('source') + return super().__getitem__("source") @property def path(self) -> str: - return super(ZarrReference, self).__getitem__('path') + return super().__getitem__("path") @property def object_id(self) -> str: - return super(ZarrReference, self).__getitem__('object_id') + return super().__getitem__("object_id") @property def source_object_id(self) -> str: - return super(ZarrReference, self).__getitem__('source_object_id') + return super().__getitem__("source_object_id") @source.setter def source(self, source: str): - super(ZarrReference, self).__setitem__('source', source) + super().__setitem__("source", source) @path.setter def path(self, path: str): - super(ZarrReference, self).__setitem__('path', path) + super().__setitem__("path", path) @object_id.setter def object_id(self, object_id: str): - super(ZarrReference, self).__setitem__('object_id', object_id) + super().__setitem__("object_id", object_id) @source_object_id.setter def source_object_id(self, object_id: str): - super(ZarrReference, self).__setitem__('source_object_id', object_id) + super().__setitem__("source_object_id", object_id) diff --git a/src/hdmf_zarr/zarr_utils.py b/src/hdmf_zarr/zarr_utils.py index c01623d0..e7790c76 100644 --- a/src/hdmf_zarr/zarr_utils.py +++ b/src/hdmf_zarr/zarr_utils.py @@ -3,6 +3,7 @@ e.g., for wrapping Zarr arrays on read, wrapping arrays for configuring write, or writing the spec among others """ + from abc import ABCMeta, abstractmethod from copy import copy import numpy as np @@ -20,10 +21,12 @@ class ZarrDataset(HDMFDataset): Extension of HDMFDataset to add Zarr compatibility """ - @docval({'name': 'dataset', 'type': (np.ndarray, Array), 'doc': 'the Zarr file lazily evaluate'}, - {'name': 'io', 'type': 'ZarrIO', 'doc': 'the IO object that was used to read the underlying dataset'}) + @docval( + {"name": "dataset", "type": 
(np.ndarray, Array), "doc": "the Zarr file lazily evaluate"}, + {"name": "io", "type": "ZarrIO", "doc": "the IO object that was used to read the underlying dataset"}, + ) def __init__(self, **kwargs): - self.__io = popargs('io', kwargs) + self.__io = popargs("io", kwargs) super().__init__(**kwargs) @property @@ -53,12 +56,12 @@ def invert(self): Return an object that defers reference resolution but in the opposite direction. """ - if not hasattr(self, '__inverted'): + if not hasattr(self, "__inverted"): cls = self.get_inverse_class() docval = get_docval(cls.__init__) kwargs = dict() for arg in docval: - kwargs[arg['name']] = getattr(self, arg['name']) + kwargs[arg["name"]] = getattr(self, arg["name"]) self.__inverted = cls(**kwargs) return self.__inverted @@ -129,12 +132,13 @@ class AbstractZarrTableDataset(DatasetOfReferences): references in compound datasets to either Builders and Containers. """ - @docval({'name': 'dataset', 'type': (np.ndarray, Array), 'doc': 'the Zarr file lazily evaluate'}, - {'name': 'io', 'type': 'ZarrIO', 'doc': 'the IO object that was used to read the underlying dataset'}, - {'name': 'types', 'type': (list, tuple), - 'doc': 'the list/tuple of reference types'}) + @docval( + {"name": "dataset", "type": (np.ndarray, Array), "doc": "the Zarr file lazily evaluate"}, + {"name": "io", "type": "ZarrIO", "doc": "the IO object that was used to read the underlying dataset"}, + {"name": "types", "type": (list, tuple), "doc": "the list/tuple of reference types"}, + ) def __init__(self, **kwargs): - types = popargs('types', kwargs) + types = popargs("types", kwargs) super().__init__(**kwargs) self.__refgetters = dict() for i, t in enumerate(types): @@ -149,15 +153,15 @@ def __init__(self, **kwargs): tmp = list() for i in range(len(self.dataset.dtype)): sub = self.dataset.dtype[i] - if np.issubdtype(sub, np.dtype('O')): - tmp.append('object') + if np.issubdtype(sub, np.dtype("O")): + tmp.append("object") if sub.metadata: - if 'vlen' in sub.metadata: - t = sub.metadata['vlen'] + if "vlen" in sub.metadata: + t = sub.metadata["vlen"] if t is str: - tmp.append('utf') + tmp.append("utf") elif t is bytes: - tmp.append('ascii') + tmp.append("ascii") else: tmp.append(sub.type.__name__) self.__dtype = tmp @@ -188,14 +192,14 @@ def _get_utf(self, string): """ Decode a dataset element to unicode """ - return string.decode('utf-8') if isinstance(string, bytes) else string + return string.decode("utf-8") if isinstance(string, bytes) else string def __get_regref(self, ref): obj = self._get_ref(ref) return obj[ref] def resolve(self, manager): - return self[0:len(self)] + return self[0 : len(self)] def __iter__(self): for i in range(len(self)): @@ -217,7 +221,7 @@ def __getitem__(self, arg): @property def dtype(self): - return 'object' + return "object" class ContainerZarrTableDataset(ContainerResolverMixin, AbstractZarrTableDataset): diff --git a/test_gallery.py b/test_gallery.py index c03fa19b..7a66d006 100644 --- a/test_gallery.py +++ b/test_gallery.py @@ -23,21 +23,13 @@ def _import_from_file(script): spec.loader.exec_module(module) -_pkg_resources_warning_re = ( - "pkg_resources is deprecated as an API" -) +_pkg_resources_warning_re = "pkg_resources is deprecated as an API" -_pkg_resources_declare_warning_re = ( - r"Deprecated call to `pkg_resources\.declare_namespace.*" -) +_pkg_resources_declare_warning_re = r"Deprecated call to `pkg_resources\.declare_namespace.*" -_numpy_warning_re = ( - "numpy.ufunc size changed, may indicate binary incompatibility. 
Expected 216, got 192" -) +_numpy_warning_re = "numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192" -_distutils_warning_re = ( - "distutils Version classes are deprecated. Use packaging.version instead." -) +_distutils_warning_re = "distutils Version classes are deprecated. Use packaging.version instead." _experimental_warning_re = ( "The ZarrIO backend is experimental. It is under active development. " @@ -50,8 +42,8 @@ def _import_from_file(script): ) _deprecation_warning_map = ( - 'Classes in map.py should be imported from hdmf.build. Importing from hdmf.build.map will be removed ' - 'in HDMF 3.0.' + "Classes in map.py should be imported from hdmf.build. Importing from hdmf.build.map will be removed " + "in HDMF 3.0." ) _deprecation_warning_fmt_docval_args = ( @@ -69,22 +61,17 @@ def _import_from_file(script): "is set), then you will need to pop the extra arguments out of kwargs before calling the function." ) -_deprecation_warning_pandas_pyarrow_re = ( - r"\nPyarrow will become a required dependency of pandas.*" -) +_deprecation_warning_pandas_pyarrow_re = r"\nPyarrow will become a required dependency of pandas.*" -_deprecation_warning_datetime = ( - r"datetime.datetime.utcfromtimestamp() *" -) +_deprecation_warning_datetime = r"datetime.datetime.utcfromtimestamp() *" -_deprecation_warning_zarr_store = ( - r"The NestedDirectoryStore is deprecated *" -) +_deprecation_warning_zarr_store = r"The NestedDirectoryStore is deprecated *" _deprecation_warning_numpy = ( "__array__ implementation doesn't accept a copy keyword, so passing copy=False failed. " "__array__ must implement 'dtype' and 'copy' keyword arguments." ) + def run_gallery_tests(): global TOTAL, FAILURES, ERRORS logging.info("Testing execution of Sphinx Gallery files") @@ -92,7 +79,7 @@ def run_gallery_tests(): # get all python file names in docs/gallery gallery_file_names = list() for root, _, files in os.walk( - os.path.join(os.path.dirname(__file__), "docs", "gallery") + os.path.join(os.path.dirname(__file__), "docs", "gallery"), ): for f in files: if f.endswith(".py"): @@ -105,57 +92,59 @@ def run_gallery_tests(): for script in gallery_file_names: logging.info("Executing %s" % script) os.chdir(curr_dir) # Reset the working directory - script_abs = os.path.abspath(script) # Determine the full path of the script + script_abs = os.path.abspath(script) # Determine the full path of the script # Set the working dir to be relative to the script to allow the use of relative file paths in the scripts os.chdir(os.path.dirname(script_abs)) try: with warnings.catch_warnings(record=True): - warnings.filterwarnings( - "ignore", message=_deprecation_warning_map, category=DeprecationWarning - ) + warnings.filterwarnings("ignore", message=_deprecation_warning_map, category=DeprecationWarning) warnings.filterwarnings( "ignore", message=_deprecation_warning_fmt_docval_args, category=PendingDeprecationWarning ) warnings.filterwarnings( "ignore", message=_deprecation_warning_call_docval_func, category=PendingDeprecationWarning ) - warnings.filterwarnings( - "ignore", message=_experimental_warning_re, category=UserWarning - ) - warnings.filterwarnings( - "ignore", message=_user_warning_transpose, category=UserWarning - ) + warnings.filterwarnings("ignore", message=_experimental_warning_re, category=UserWarning) + warnings.filterwarnings("ignore", message=_user_warning_transpose, category=UserWarning) warnings.filterwarnings( # this warning is triggered from pandas when HDMF is installed with the minimum 
requirements - "ignore", message=_distutils_warning_re, category=DeprecationWarning + "ignore", + message=_distutils_warning_re, + category=DeprecationWarning, ) warnings.filterwarnings( # this warning is triggered when some numpy extension code in an upstream package was compiled # against a different version of numpy than the one installed - "ignore", message=_numpy_warning_re, category=RuntimeWarning + "ignore", + message=_numpy_warning_re, + category=RuntimeWarning, ) warnings.filterwarnings( # this warning is triggered when downstream code such as pynwb uses pkg_resources>=5.13 - "ignore", message=_pkg_resources_warning_re, category=DeprecationWarning + "ignore", + message=_pkg_resources_warning_re, + category=DeprecationWarning, ) warnings.filterwarnings( # this warning is triggered when downstream code such as pynwb uses pkg_resources>=5.13 - "ignore", message=_pkg_resources_declare_warning_re, category=DeprecationWarning + "ignore", + message=_pkg_resources_declare_warning_re, + category=DeprecationWarning, ) warnings.filterwarnings( # this warning is triggered from pandas - "ignore", message=_deprecation_warning_pandas_pyarrow_re, category=DeprecationWarning + "ignore", + message=_deprecation_warning_pandas_pyarrow_re, + category=DeprecationWarning, ) warnings.filterwarnings( # this is triggered from datetime - "ignore", message=_deprecation_warning_datetime, category=DeprecationWarning - ) - warnings.filterwarnings( - "ignore", message=_deprecation_warning_zarr_store, category=FutureWarning - ) - warnings.filterwarnings( - "ignore", message=_deprecation_warning_numpy, category=DeprecationWarning + "ignore", + message=_deprecation_warning_datetime, + category=DeprecationWarning, ) + warnings.filterwarnings("ignore", message=_deprecation_warning_zarr_store, category=FutureWarning) + warnings.filterwarnings("ignore", message=_deprecation_warning_numpy, category=DeprecationWarning) _import_from_file(script_abs) except Exception: print(traceback.format_exc()) diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py index ddfe9dc5..c30cf482 100644 --- a/tests/unit/base_tests_zarrio.py +++ b/tests/unit/base_tests_zarrio.py @@ -3,6 +3,7 @@ The actual tests are then instantiated with various different backends in the test_zarrio.py module.""" + import unittest import os import numpy as np @@ -13,31 +14,23 @@ import zarr from hdmf_zarr.backend import ZarrIO from hdmf_zarr.utils import ZarrDataIO, ZarrReference -from tests.unit.utils import (Baz, BazData, BazBucket, get_baz_buildmanager) +from tests.unit.utils import Baz, BazData, BazBucket, get_baz_buildmanager # Try to import numcodecs and disable compression tests if it is not available try: from numcodecs import Blosc, Delta, JSON + DISABLE_ZARR_COMPRESSION_TESTS = False except ImportError: DISABLE_ZARR_COMPRESSION_TESTS = True from hdmf.spec.namespace import NamespaceCatalog -from hdmf.build import (GroupBuilder, - DatasetBuilder, - LinkBuilder, - ReferenceBuilder, - OrphanContainerBuildError) +from hdmf.build import GroupBuilder, DatasetBuilder, LinkBuilder, ReferenceBuilder, OrphanContainerBuildError from hdmf.data_utils import DataChunkIterator from hdmf.testing import TestCase -from hdmf.backends.io import (HDMFIO, - UnsupportedOperation) +from hdmf.backends.io import HDMFIO, UnsupportedOperation -from tests.unit.utils import (Foo, - FooBucket, - FooFile, - get_foo_buildmanager, - CacheSpecTestHelper) +from tests.unit.utils import Foo, FooBucket, FooFile, get_foo_buildmanager, CacheSpecTestHelper from abc 
import ABCMeta, abstractmethod @@ -75,7 +68,7 @@ def tearDown(self): """ Remove all files and folders defined by self.store_path """ - paths = self.store_path if isinstance(self.store_path, list) else [self.store_path, ] + paths = self.store_path if isinstance(self.store_path, list) else [self.store_path] for path in paths: if os.path.exists(path): if os.path.isdir(path): @@ -91,6 +84,7 @@ class ZarrStoreTestCase(TestCase): Class that creates a zarr file containing groups, datasets, and references for general purpose testing. """ + def setUp(self): self.store_path = "test_io.zarr" @@ -104,22 +98,22 @@ def tearDown(self): def createReferenceBuilder(self): data_1 = np.arange(100, 200, 10).reshape(2, 5) data_2 = np.arange(0, 200, 10).reshape(4, 5) - dataset_1 = DatasetBuilder('dataset_1', data_1) - dataset_2 = DatasetBuilder('dataset_2', data_2) + dataset_1 = DatasetBuilder("dataset_1", data_1) + dataset_2 = DatasetBuilder("dataset_2", data_2) ref_dataset_1 = ReferenceBuilder(dataset_1) ref_dataset_2 = ReferenceBuilder(dataset_2) ref_data = [ref_dataset_1, ref_dataset_2] - dataset_ref = DatasetBuilder('ref_dataset', ref_data, dtype='object') + dataset_ref = DatasetBuilder("ref_dataset", ref_data, dtype="object") - builder = GroupBuilder('root', - source=self.store_path, - datasets={'dataset_1': dataset_1, - 'dataset_2': dataset_2, - 'ref_dataset': dataset_ref}) + builder = GroupBuilder( + name="root", + source=self.store_path, + datasets={"dataset_1": dataset_1, "dataset_2": dataset_2, "ref_dataset": dataset_ref}, + ) return builder - def create_zarr(self, consolidate_metadata=True, force_overwrite=False, mode='a'): + def create_zarr(self, consolidate_metadata=True, force_overwrite=False, mode="a"): builder = self.createReferenceBuilder() writer = ZarrIO(self.store_path, mode=mode, force_overwrite=force_overwrite) writer.write_builder(builder, consolidate_metadata) @@ -151,85 +145,91 @@ def setUp(self): self.store_path = "test_io.zarr" def createGroupBuilder(self): - self.foo_builder = GroupBuilder('foo1', - attributes={'data_type': 'Foo', - 'namespace': 'test_core', - 'attr1': 17.5}, - datasets={'my_data': self.__dataset_builder}) + self.foo_builder = GroupBuilder( + "foo1", + attributes={"data_type": "Foo", "namespace": "test_core", "attr1": 17.5}, + datasets={"my_data": self.__dataset_builder}, + ) # self.foo = Foo('foo1', self.__dataset_builder.data, attr1="bar", attr2=17, attr3=3.14) # self.manager.prebuilt(self.foo, self.foo_builder) self.builder = GroupBuilder( - 'root', + name="root", source=self.store_path, - groups={'test_bucket': - GroupBuilder('test_bucket', - groups={'foo_holder': - GroupBuilder('foo_holder', - groups={'foo1': self.foo_builder})})}, - attributes={'data_type': 'FooFile'}) + groups={ + "test_bucket": GroupBuilder( + name="test_bucket", + groups={ + "foo_holder": GroupBuilder(name="foo_holder", groups={"foo1": self.foo_builder}), + }, + ) + }, + attributes={"data_type": "FooFile"}, + ) def createReferenceBuilder(self): data_1 = np.arange(100, 200, 10).reshape(2, 5) data_2 = np.arange(0, 200, 10).reshape(4, 5) - dataset_1 = DatasetBuilder('dataset_1', data_1) - dataset_2 = DatasetBuilder('dataset_2', data_2) + dataset_1 = DatasetBuilder("dataset_1", data_1) + dataset_2 = DatasetBuilder("dataset_2", data_2) ref_dataset_1 = ReferenceBuilder(dataset_1) ref_dataset_2 = ReferenceBuilder(dataset_2) ref_data = [ref_dataset_1, ref_dataset_2] - dataset_ref = DatasetBuilder('ref_dataset', ref_data, dtype='object') + dataset_ref = DatasetBuilder("ref_dataset", ref_data, 
dtype="object") - builder = GroupBuilder('root', - source=self.store_path, - datasets={'dataset_1': dataset_1, - 'dataset_2': dataset_2, - 'ref_dataset': dataset_ref}) + builder = GroupBuilder( + name="root", + source=self.store_path, + datasets={"dataset_1": dataset_1, "dataset_2": dataset_2, "ref_dataset": dataset_ref}, + ) return builder def createReferenceCompoundBuilder(self): data_1 = np.arange(100, 200, 10).reshape(2, 5) data_2 = np.arange(0, 200, 10).reshape(4, 5) - dataset_1 = DatasetBuilder('dataset_1', data_1) - dataset_2 = DatasetBuilder('dataset_2', data_2) + dataset_1 = DatasetBuilder("dataset_1", data_1) + dataset_2 = DatasetBuilder("dataset_2", data_2) ref_dataset_1 = ReferenceBuilder(dataset_1) ref_dataset_2 = ReferenceBuilder(dataset_2) ref_data = [ - (1, 'dataset_1', ref_dataset_1), - (2, 'dataset_2', ref_dataset_2) + (1, "dataset_1", ref_dataset_1), + (2, "dataset_2", ref_dataset_2), + ] + ref_data_type = [ + {"name": "id", "dtype": "int"}, + {"name": "name", "dtype": str}, + {"name": "reference", "dtype": "object"}, ] - ref_data_type = [{'name': 'id', 'dtype': 'int'}, - {'name': 'name', 'dtype': str}, - {'name': 'reference', 'dtype': 'object'}] - dataset_ref = DatasetBuilder('ref_dataset', ref_data, dtype=ref_data_type) - builder = GroupBuilder('root', - source=self.store_path, - datasets={'dataset_1': dataset_1, - 'dataset_2': dataset_2, - 'ref_dataset': dataset_ref}) + dataset_ref = DatasetBuilder("ref_dataset", ref_data, dtype=ref_data_type) + builder = GroupBuilder( + name="root", + source=self.store_path, + datasets={"dataset_1": dataset_1, "dataset_2": dataset_2, "ref_dataset": dataset_ref}, + ) return builder def test_cannot_read(self): assert not ZarrIO.can_read("incorrect_path") def read_test_dataset(self): - reader = ZarrIO(self.store_path, manager=self.manager, mode='r') + reader = ZarrIO(self.store_path, manager=self.manager, mode="r") self.root = reader.read_builder() - dataset = self.root['test_bucket/foo_holder/foo1/my_data'] + dataset = self.root["test_bucket/foo_holder/foo1/my_data"] return dataset def read(self): - reader = ZarrIO(self.store_path, manager=self.manager, mode='r') + reader = ZarrIO(self.store_path, manager=self.manager, mode="r") self.root = reader.read_builder() def test_cache_spec(self): - tempIO = ZarrIO(self.store_path, manager=self.manager, mode='w') + tempIO = ZarrIO(self.store_path, manager=self.manager, mode="w") # Setup all the data we need - foo1 = Foo('foo1', [0, 1, 2, 3, 4], "I am foo1", 17, 3.14) - foo2 = Foo('foo2', [5, 6, 7, 8, 9], "I am foo2", 34, 6.28) - foobucket = FooBucket('test_bucket', [foo1, foo2]) + foo1 = Foo("foo1", [0, 1, 2, 3, 4], "I am foo1", 17, 3.14) + foo2 = Foo("foo2", [5, 6, 7, 8, 9], "I am foo2", 34, 6.28) + foobucket = FooBucket("test_bucket", [foo1, foo2]) foofile = FooFile(buckets=[foobucket]) # Write the first file @@ -239,16 +239,16 @@ def test_cache_spec(self): # Load the spec and assert that it is valid ns_catalog = NamespaceCatalog() ZarrIO.load_namespaces(ns_catalog, self.store_path) - self.assertEqual(ns_catalog.namespaces, ('test_core',)) + self.assertEqual(ns_catalog.namespaces, ("test_core",)) source_types = CacheSpecTestHelper.get_types(self.manager.namespace_catalog) read_types = CacheSpecTestHelper.get_types(ns_catalog) self.assertSetEqual(source_types, read_types) def test_write_int(self, test_data=None): data = np.arange(100, 200, 10).reshape(2, 5) if test_data is None else test_data - self.__dataset_builder = DatasetBuilder('my_data', data, attributes={'attr2': 17}) + 
self.__dataset_builder = DatasetBuilder("my_data", data, attributes={"attr2": 17}) self.createGroupBuilder() - writer = ZarrIO(self.store_path, manager=self.manager, mode='a') + writer = ZarrIO(self.store_path, manager=self.manager, mode="a") writer.write_builder(self.builder) writer.close() assert ZarrIO.can_read(self.store_path) @@ -259,73 +259,70 @@ def test_write_compound(self, test_data=None): each tuple consists of an int and a string :return: """ - data = [(1, 'Allen'), - (2, 'Bob'), - (3, 'Mike'), - (4, 'Jenny')] if test_data is None else test_data - data_type = [{'name': 'id', 'dtype': 'int'}, - {'name': 'name', 'dtype': str}] - self.__dataset_builder = DatasetBuilder('my_data', data, dtype=data_type) + if test_data is None: + test_data = [(1, "Allen"), (2, "Bob"), (3, "Mike"), (4, "Jenny")] + data_type = [{"name": "id", "dtype": "int"}, {"name": "name", "dtype": str}] + self.__dataset_builder = DatasetBuilder("my_data", test_data, dtype=data_type) self.createGroupBuilder() - writer = ZarrIO(self.store_path, manager=self.manager, mode='w') + writer = ZarrIO(self.store_path, manager=self.manager, mode="w") writer.write_builder(self.builder) writer.close() def test_write_chunk(self, test_data=None): - data = np.arange(100, 200, 10).reshape(2, 5) if test_data is None else test_data - data_io = ZarrDataIO(data=data, chunks=(1, 5), fillvalue=-1) - self.__dataset_builder = DatasetBuilder('my_data', data_io, attributes={'attr2': 17}) + if test_data is None: + test_data = np.arange(100, 200, 10).reshape(2, 5) + data_io = ZarrDataIO(data=test_data, chunks=(1, 5), fillvalue=-1) + self.__dataset_builder = DatasetBuilder("my_data", data_io, attributes={"attr2": 17}) self.createGroupBuilder() - writer = ZarrIO(self.store_path, manager=self.manager, mode='a') + writer = ZarrIO(self.store_path, manager=self.manager, mode="a") writer.write_builder(self.builder) writer.close() def test_write_strings(self, test_data=None): - data = [['a', 'aa', 'aaa', 'aaaa', 'aaaaa'], - ['b', 'bb', 'bbb', 'bbbb', 'bbbbb']] if test_data is None else test_data - self.__dataset_builder = DatasetBuilder('my_data', data, attributes={'attr2': 17}) + if test_data is None: + test_data = [["a", "aa", "aaa", "aaaa", "aaaaa"], ["b", "bb", "bbb", "bbbb", "bbbbb"]] + self.__dataset_builder = DatasetBuilder("my_data", test_data, attributes={"attr2": 17}) self.createGroupBuilder() - writer = ZarrIO(self.store_path, manager=self.manager, mode='a') + writer = ZarrIO(self.store_path, manager=self.manager, mode="a") writer.write_builder(self.builder) writer.close() def test_write_links(self, test_data=None): - data = np.arange(100, 200, 10).reshape(2, 5) if test_data is None else test_data - self.__dataset_builder = DatasetBuilder('my_data', data, attributes={'attr2': 17}) + if test_data is None: + test_data = np.arange(100, 200, 10).reshape(2, 5) + self.__dataset_builder = DatasetBuilder("my_data", test_data, attributes={"attr2": 17}) self.createGroupBuilder() - link_parent = self.builder['test_bucket'] - link_parent.set_link(LinkBuilder(self.foo_builder, 'my_link')) - link_parent.set_link(LinkBuilder(self.__dataset_builder, 'my_dataset')) - writer = ZarrIO(self.store_path, manager=self.manager, mode='a') + link_parent = self.builder["test_bucket"] + link_parent.set_link(LinkBuilder(self.foo_builder, "my_link")) + link_parent.set_link(LinkBuilder(self.__dataset_builder, "my_dataset")) + writer = ZarrIO(self.store_path, manager=self.manager, mode="a") writer.write_builder(self.builder) writer.close() def test_write_link_array(self): 
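        """Write a dataset, wrap the stored zarr.Array with ZarrDataIO(link_data=True),
        write it again as a linked dataset, and verify the linked data reads back unchanged."""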
data = np.arange(100, 200, 10).reshape(2, 5) - self.__dataset_builder = DatasetBuilder('my_data', data, attributes={'attr2': 17}) + self.__dataset_builder = DatasetBuilder("my_data", data, attributes={"attr2": 17}) self.createGroupBuilder() - writer = ZarrIO(self.store_path, manager=self.manager, mode='a') + writer = ZarrIO(self.store_path, manager=self.manager, mode="a") writer.write_builder(self.builder) - zarr_file = zarr.open(self.store_path, mode='r') + zarr_file = zarr.open(self.store_path, mode="r") zarr_array = zarr_file["/test_bucket/foo_holder/foo1/my_data"] link_io = ZarrDataIO(data=zarr_array, link_data=True) - link_dataset = DatasetBuilder('dataset_link', link_io) - self.builder['test_bucket'].set_dataset(link_dataset) + link_dataset = DatasetBuilder("dataset_link", link_io) + self.builder["test_bucket"].set_dataset(link_dataset) writer.write_builder(self.builder) writer.close() - reader = ZarrIO(self.store_path, manager=self.manager, mode='r') + reader = ZarrIO(self.store_path, manager=self.manager, mode="r") self.root = reader.read_builder() - read_link = self.root['test_bucket/dataset_link'] - read_link_data = read_link['builder']['data'][:] + read_link = self.root["test_bucket/dataset_link"] + read_link_data = read_link["builder"]["data"][:] self.assertTrue(np.all(data == read_link_data)) reader.close() def test_write_reference(self): builder = self.createReferenceBuilder() - writer = ZarrIO(self.store_path, - manager=self.manager, - mode='a') + writer = ZarrIO(self.store_path, manager=self.manager, mode="a") writer.write_builder(builder) writer.close() @@ -334,63 +331,62 @@ def test_write_references_roundtrip(self): num_bazs = 1 bazs = [] # set up dataset of references for i in range(num_bazs): - bazs.append(Baz(name='baz%d' % i)) - baz_data = BazData(name='baz_data', data=bazs) + bazs.append(Baz(name="baz%d" % i)) + baz_data = BazData(name="baz_data", data=bazs) container = BazBucket(bazs=bazs, baz_data=baz_data) manager = get_baz_buildmanager() # write to file - with ZarrIO(self.store_path, manager=manager, mode='w') as writer: + with ZarrIO(self.store_path, manager=manager, mode="w") as writer: writer.write(container=container) # read from file and validate references - with ZarrIO(self.store_path, manager=manager, mode='r') as reader: + with ZarrIO(self.store_path, manager=manager, mode="r") as reader: read_container = reader.read() for i in range(num_bazs): - baz_name = 'baz%d' % i + baz_name = "baz%d" % i expected_container = read_container.bazs[baz_name] - expected_value = {'source': 'test_io.zarr', - 'path': '/bazs/' + baz_name, - 'object_id': expected_container.object_id, - 'source_object_id': read_container.object_id} + expected_value = { + "source": "test_io.zarr", + "path": "/bazs/" + baz_name, + "object_id": expected_container.object_id, + "source_object_id": read_container.object_id, + } # Read the dict with the definition of the reference from the raw Zarr file and compare # to also check that reference (included object id's) are defined correctly - self.assertDictEqual(reader.file['baz_data'][i], expected_value) + self.assertDictEqual(reader.file["baz_data"][i], expected_value) # Also test using the low-level reference functions zarr_ref = ZarrReference(**expected_value) # Check the ZarrReference first - self.assertEqual(zarr_ref.object_id, expected_value['object_id']) - self.assertEqual(zarr_ref.source_object_id, expected_value['source_object_id']) + self.assertEqual(zarr_ref.object_id, expected_value["object_id"]) + 
self.assertEqual(zarr_ref.source_object_id, expected_value["source_object_id"]) def test_write_reference_compound(self): builder = self.createReferenceCompoundBuilder() - writer = ZarrIO(self.store_path, manager=self.manager, mode='a') + writer = ZarrIO(self.store_path, manager=self.manager, mode="a") writer.write_builder(builder) writer.close() def test_read_int(self): test_data = np.arange(100, 200, 10).reshape(5, 2) self.test_write_int(test_data=test_data) - dataset = self.read_test_dataset()['data'][:] + dataset = self.read_test_dataset()["data"][:] self.assertTrue(np.all(test_data == dataset)) def test_read_chunk(self): test_data = np.arange(100, 200, 10).reshape(5, 2) self.test_write_chunk(test_data=test_data) - dataset = self.read_test_dataset()['data'][:] + dataset = self.read_test_dataset()["data"][:] self.assertTrue(np.all(test_data == dataset)) def test_read_strings(self): - test_data = [['a1', 'aa2', 'aaa3', 'aaaa4', 'aaaaa5'], - ['b1', 'bb2', 'bbb3', 'bbbb4', 'bbbbb5']] + test_data = [["a1", "aa2", "aaa3", "aaaa4", "aaaaa5"], ["b1", "bb2", "bbb3", "bbbb4", "bbbbb5"]] self.test_write_strings(test_data=test_data) - dataset = self.read_test_dataset()['data'][:] + dataset = self.read_test_dataset()["data"][:] self.assertTrue(np.all(np.asarray(test_data) == dataset)) def test_read_compound(self): - test_data = [(1, 'Allen1'), - (2, 'Bob1'), - (3, 'Mike1')] + test_data = [(1, "Allen1"), (2, "Bob1"), (3, "Mike1")] self.test_write_compound(test_data=test_data) - dataset = self.read_test_dataset()['data'] + dataset = self.read_test_dataset()["data"] self.assertTupleEqual(test_data[0], tuple(dataset[0])) self.assertTupleEqual(test_data[1], tuple(dataset[1])) self.assertTupleEqual(test_data[2], tuple(dataset[2])) @@ -399,81 +395,89 @@ def test_read_link(self): test_data = np.arange(100, 200, 10).reshape(5, 2) self.test_write_links(test_data=test_data) self.read() - link_data = self.root['test_bucket'].links['my_dataset'].builder.data[()] + link_data = self.root["test_bucket"].links["my_dataset"].builder.data[()] self.assertTrue(np.all(np.asarray(test_data) == link_data)) # print(self.root['test_bucket'].links['my_dataset'].builder.data[()]) def test_read_link_buf(self): data = np.arange(100, 200, 10).reshape(2, 5) - self.__dataset_builder = DatasetBuilder('my_data', data, attributes={'attr2': 17}) + self.__dataset_builder = DatasetBuilder("my_data", data, attributes={"attr2": 17}) self.createGroupBuilder() - link_parent_1 = self.builder['test_bucket'] - link_parent_2 = self.builder['test_bucket/foo_holder'] - link_parent_1.set_link(LinkBuilder(self.__dataset_builder, 'my_dataset_1')) - link_parent_2.set_link(LinkBuilder(self.__dataset_builder, 'my_dataset_2')) - writer = ZarrIO(self.store_path, manager=self.manager, mode='a') + link_parent_1 = self.builder["test_bucket"] + link_parent_2 = self.builder["test_bucket/foo_holder"] + link_parent_1.set_link(LinkBuilder(self.__dataset_builder, "my_dataset_1")) + link_parent_2.set_link(LinkBuilder(self.__dataset_builder, "my_dataset_2")) + writer = ZarrIO(self.store_path, manager=self.manager, mode="a") writer.write_builder(self.builder) writer.close() self.read() - self.assertTrue(self.root['test_bucket'].links['my_dataset_1'].builder == - self.root['test_bucket/foo_holder'].links['my_dataset_2'].builder) + self.assertTrue( + self.root["test_bucket"].links["my_dataset_1"].builder + == self.root["test_bucket/foo_holder"].links["my_dataset_2"].builder + ) def test_read_reference(self): self.test_write_reference() self.read() - builder = 
self.createReferenceBuilder()['ref_dataset'] - read_builder = self.root['ref_dataset'] + builder = self.createReferenceBuilder()["ref_dataset"] + read_builder = self.root["ref_dataset"] # Load the linked arrays and confirm we get the same data as we had in the original builder - for i, v in enumerate(read_builder['data']): - self.assertTrue(np.all(builder['data'][i]['builder']['data'] == v['data'][:])) + for i, v in enumerate(read_builder["data"]): + self.assertTrue(np.all(builder["data"][i]["builder"]["data"] == v["data"][:])) def test_read_reference_compound(self): self.test_write_reference_compound() self.read() - builder = self.createReferenceCompoundBuilder()['ref_dataset'] - read_builder = self.root['ref_dataset'] + builder = self.createReferenceCompoundBuilder()["ref_dataset"] + read_builder = self.root["ref_dataset"] # ensure the array was written as a compound array - ref_dtype = np.dtype([('id', ' np.ndarray: def test_parallel_write(tmpdir): number_of_jobs = 2 - data = np.array([1., 2., 3.]) + data = np.array([1.0, 2.0, 3.0]) column = VectorData(name="TestColumn", description="", data=PickleableDataChunkIterator(data=data)) dynamic_table = DynamicTable(name="TestTable", description="", id=list(range(3)), columns=[column]) zarr_top_level_path = str(tmpdir / "test_parallel_write.zarr") - with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: + with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: io.write(container=dynamic_table, number_of_jobs=number_of_jobs) with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="r") as io: @@ -96,22 +98,26 @@ def test_parallel_write(tmpdir): def test_mixed_iterator_types(tmpdir): number_of_jobs = 2 - generic_iterator_data = np.array([1., 2., 3.]) + generic_iterator_data = np.array([1.0, 2.0, 3.0]) generic_iterator_column = VectorData( name="TestGenericIteratorColumn", description="", - data=PickleableDataChunkIterator(data=generic_iterator_data) + data=PickleableDataChunkIterator(data=generic_iterator_data), ) - classic_iterator_data = np.array([4., 5., 6.]) + classic_iterator_data = np.array([4.0, 5.0, 6.0]) classic_iterator_column = VectorData( name="TestClassicIteratorColumn", description="", - data=DataChunkIterator(data=classic_iterator_data) + data=DataChunkIterator(data=classic_iterator_data), ) - unwrappped_data = np.array([7., 8., 9.]) - unwrapped_column = VectorData(name="TestUnwrappedColumn", description="", data=unwrappped_data) + unwrappped_data = np.array([7.0, 8.0, 9.0]) + unwrapped_column = VectorData( + name="TestUnwrappedColumn", + description="", + data=unwrappped_data, + ) dynamic_table = DynamicTable( name="TestTable", description="", @@ -120,7 +126,7 @@ def test_mixed_iterator_types(tmpdir): ) zarr_top_level_path = str(tmpdir / "test_mixed_iterator_types.zarr") - with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: + with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: io.write(container=dynamic_table, number_of_jobs=number_of_jobs) with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="r") as io: @@ -138,18 +144,18 @@ def test_mixed_iterator_types(tmpdir): def test_mixed_iterator_pickleability(tmpdir): number_of_jobs = 2 - pickleable_iterator_data = np.array([1., 2., 3.]) + pickleable_iterator_data = np.array([1.0, 2.0, 3.0]) pickleable_iterator_column = VectorData( name="TestGenericIteratorColumn", description="", - data=PickleableDataChunkIterator(data=pickleable_iterator_data) + 
data=PickleableDataChunkIterator(data=pickleable_iterator_data), ) - not_pickleable_iterator_data = np.array([4., 5., 6.]) + not_pickleable_iterator_data = np.array([4.0, 5.0, 6.0]) not_pickleable_iterator_column = VectorData( name="TestClassicIteratorColumn", description="", - data=NotPickleableDataChunkIterator(data=not_pickleable_iterator_data) + data=NotPickleableDataChunkIterator(data=not_pickleable_iterator_data), ) dynamic_table = DynamicTable( @@ -160,7 +166,7 @@ def test_mixed_iterator_pickleability(tmpdir): ) zarr_top_level_path = str(tmpdir / "test_mixed_iterator_pickleability.zarr") - with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: + with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: io.write(container=dynamic_table, number_of_jobs=number_of_jobs) with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="r") as io: @@ -180,20 +186,20 @@ def test_simple_tqdm(tmpdir): zarr_top_level_path = str(tmpdir / "test_simple_tqdm.zarr") with patch("sys.stderr", new=StringIO()) as tqdm_out: - with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: + with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: column = VectorData( name="TestColumn", description="", data=PickleableDataChunkIterator( - data=np.array([1., 2., 3.]), + data=np.array([1.0, 2.0, 3.0]), display_progress=True, - ) + ), ) dynamic_table = DynamicTable( name="TestTable", description="", columns=[column], - id=list(range(3)) # must provide id's when all columns are iterators + id=list(range(3)), # must provide id's when all columns are iterators ) io.write(container=dynamic_table, number_of_jobs=number_of_jobs) @@ -208,29 +214,29 @@ def test_compound_tqdm(tmpdir): zarr_top_level_path = str(tmpdir / "test_compound_tqdm.zarr") with patch("sys.stderr", new=StringIO()) as tqdm_out: - with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: + with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: pickleable_column = VectorData( name="TestPickleableIteratorColumn", description="", data=PickleableDataChunkIterator( - data=np.array([1., 2., 3.]), + data=np.array([1.0, 2.0, 3.0]), display_progress=True, - ) + ), ) not_pickleable_column = VectorData( name="TestNotPickleableColumn", description="", data=NotPickleableDataChunkIterator( - data=np.array([4., 5., 6.]), + data=np.array([4.0, 5.0, 6.0]), display_progress=True, - progress_bar_options=dict(desc=expected_desc_not_pickleable, position=1) - ) + progress_bar_options=dict(desc=expected_desc_not_pickleable, position=1), + ), ) dynamic_table = DynamicTable( name="TestTable", description="", columns=[pickleable_column, not_pickleable_column], - id=list(range(3)) # must provide id's when all columns are iterators + id=list(range(3)), # must provide id's when all columns are iterators ) io.write(container=dynamic_table, number_of_jobs=number_of_jobs) @@ -242,7 +248,7 @@ def test_compound_tqdm(tmpdir): def test_extra_keyword_argument_propagation(tmpdir): number_of_jobs = 2 - column = VectorData(name="TestColumn", description="", data=np.array([1., 2., 3.])) + column = VectorData(name="TestColumn", description="", data=np.array([1.0, 2.0, 3.0])) dynamic_table = DynamicTable(name="TestTable", description="", id=list(range(3)), columns=[column]) zarr_top_level_path = str(tmpdir / "test_extra_parallel_write_keyword_arguments.zarr") @@ -263,12 +269,12 @@ def test_extra_keyword_argument_propagation(tmpdir): for test_keyword_argument_pair in 
test_keyword_argument_pairs: test_max_threads_per_process = test_keyword_argument_pair["max_threads_per_process"] test_multiprocessing_context = test_keyword_argument_pair["multiprocessing_context"] - with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: + with ZarrIO(path=zarr_top_level_path, manager=get_manager(), mode="w") as io: io.write( container=dynamic_table, number_of_jobs=number_of_jobs, max_threads_per_process=test_max_threads_per_process, - multiprocessing_context=test_multiprocessing_context + multiprocessing_context=test_multiprocessing_context, ) assert io._ZarrIO__dci_queue.max_threads_per_process == test_max_threads_per_process diff --git a/tests/unit/test_zarrdataio.py b/tests/unit/test_zarrdataio.py index f52f2f5c..963308e3 100644 --- a/tests/unit/test_zarrdataio.py +++ b/tests/unit/test_zarrdataio.py @@ -6,6 +6,7 @@ more complex operations and are more akin to integration tests This module focuses on test for specific unit functions of ZarrDataIO. """ + import numcodecs import h5py import os @@ -16,6 +17,7 @@ try: import hdf5plugin + HDF5PLUGIN = True except ImportError: HDF5PLUGIN = False @@ -23,8 +25,10 @@ from hdmf_zarr.utils import ZarrDataIO from tests.unit.utils import get_temp_filepath + class TestZarrDataIO(TestCase): """Test the ZarrDataIO class""" + def setUp(self): self.hdf_filename = get_temp_filepath() self.zarr_filename = get_temp_filepath() @@ -41,8 +45,8 @@ def tearDown(self): def test_hdf5_to_zarr_filters_scaleoffset(self): """Test that we warn when the scaleoffset filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters.""" # Create a test HDF5 dataset with scaleoffset - h5file = h5py.File(self.hdf_filename, mode='a') - h5dset = h5file.create_dataset(name='test_dset', data=[1,2,3,4,5], scaleoffset=10) + h5file = h5py.File(self.hdf_filename, mode="a") + h5dset = h5file.create_dataset(name="test_dset", data=[1, 2, 3, 4, 5], scaleoffset=10) # test that we warn due to the scaleoffset msg = "/test_dset HDF5 scaleoffset filter ignored in Zarr" with self.assertWarnsWith(UserWarning, msg): @@ -54,8 +58,8 @@ def test_hdf5_to_zarr_filters_scaleoffset(self): def test_hdf5_to_zarr_filters_lzf(self): """Test that we warn when the lzf filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters.""" # Create a test HDF5 dataset with scaleoffset - h5file = h5py.File(self.hdf_filename, mode='a') - h5dset = h5file.create_dataset(name='test_dset', data=[1, 2, 3, 4, 5], compression="lzf") + h5file = h5py.File(self.hdf_filename, mode="a") + h5dset = h5file.create_dataset(name="test_dset", data=[1, 2, 3, 4, 5], compression="lzf") # test that we warn due to the scaleoffset msg = "/test_dset HDF5 szip or lzf compression ignored in Zarr" with self.assertWarnsWith(UserWarning, msg): @@ -68,10 +72,12 @@ def test_hdf5_to_zarr_filters_lzf(self): def test_hdf5_to_zarr_filters_lz4(self): """Test that we warn when the lz4 filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters.""" # Create a test HDF5 dataset with scaleoffset - h5file = h5py.File(self.hdf_filename, mode='a') - h5dset = h5file.create_dataset(name='test_dset', - data=[1, 2, 3, 4, 5], - **hdf5plugin.LZ4()) + h5file = h5py.File(self.hdf_filename, mode="a") + h5dset = h5file.create_dataset( + name="test_dset", + data=[1, 2, 3, 4, 5], + **hdf5plugin.LZ4(), + ) # test that we warn due to the scaleoffset msg = "/test_dset HDF5 lz4 compression ignored in Zarr" with self.assertWarnsWith(UserWarning, msg): @@ -84,10 +90,12 @@ def test_hdf5_to_zarr_filters_lz4(self): def 
test_hdf5_to_zarr_filters_bitshuffle(self): """Test that we warn when the bitshuffle filter is being used in HDF5 in ZarrDataIO.hdf5_to_zarr_filters.""" # Create a test HDF5 dataset with scaleoffset - h5file = h5py.File(self.hdf_filename, mode='a') - h5dset = h5file.create_dataset(name='test_dset', - data=[1, 2, 3, 4, 5], - **hdf5plugin.Bitshuffle(nelems=0, lz4=True)) + h5file = h5py.File(self.hdf_filename, mode="a") + h5dset = h5file.create_dataset( + name="test_dset", + data=[1, 2, 3, 4, 5], + **hdf5plugin.Bitshuffle(nelems=0, lz4=True), + ) # test that we warn due to the scaleoffset msg = "/test_dset HDF5 bitshuffle compression ignored in Zarr" with self.assertWarnsWith(UserWarning, msg): @@ -103,11 +111,12 @@ def test_hdf5_to_zarr_filters_other_unsupported(self): This test is to ensure that the catch-all at the end of the loop works. """ # Create a test HDF5 dataset with scaleoffset - h5file = h5py.File(self.hdf_filename, mode='a') + h5file = h5py.File(self.hdf_filename, mode="a") h5dset_FciDecomp = h5file.create_dataset( - name='test_fcidecomp', + name="test_fcidecomp", data=[1, 2, 3, 4, 5], - **hdf5plugin.FciDecomp()) + **hdf5plugin.FciDecomp(), + ) # test that we warn due to the FciDecomp msg = r"/test_fcidecomp HDF5 filter id 32018 with properties .* ignored in Zarr." with self.assertWarnsRegex(UserWarning, msg): @@ -119,9 +128,17 @@ def test_hdf5_to_zarr_filters_other_unsupported(self): def test_hdf5_to_zarr_filters_shuffle(self): """Test HDF5 shuffle filter works with ZarrDataIO.hdf5_to_zarr_filters.""" # Create a test HDF5 dataset with scaleoffset - h5file = h5py.File(self.hdf_filename, mode='a') - h5dset_int = h5file.create_dataset(name='test_int', data=np.arange(5, dtype='int32'), shuffle=True) - h5dset_float = h5file.create_dataset(name='test_float', data=np.arange(5, dtype='float32'), shuffle=True) + h5file = h5py.File(self.hdf_filename, mode="a") + h5dset_int = h5file.create_dataset( + name="test_int", + data=np.arange(5, dtype="int32"), + shuffle=True, + ) + h5dset_float = h5file.create_dataset( + name="test_float", + data=np.arange(5, dtype="float32"), + shuffle=True, + ) # test that we apply shuffle filter on int data filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset_int) self.assertEqual(len(filters), 1) @@ -136,17 +153,17 @@ def test_hdf5_to_zarr_filters_shuffle(self): def test_hdf5_to_zarr_filters_blosclz(self): """Test HDF5 blosclz filter works with ZarrDataIO.hdf5_to_zarr_filters.""" # Create a test HDF5 dataset with scaleoffset - h5file = h5py.File(self.hdf_filename, mode='a') + h5file = h5py.File(self.hdf_filename, mode="a") h5dset = h5file.create_dataset( - name='test_int', - data=np.arange(100, dtype='float32'), - **hdf5plugin.Blosc(cname='blosclz', clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE) + name="test_int", + data=np.arange(100, dtype="float32"), + **hdf5plugin.Blosc(cname="blosclz", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), ) # test that we apply shuffle filter on int data filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset) self.assertEqual(len(filters), 1) self.assertIsInstance(filters[0], numcodecs.Blosc) - self.assertEqual(filters[0].cname, 'blosclz') + self.assertEqual(filters[0].cname, "blosclz") self.assertEqual(filters[0].clevel, 9) self.assertEqual(filters[0].shuffle, hdf5plugin.Blosc.SHUFFLE) h5file.close() @@ -155,11 +172,11 @@ def test_hdf5_to_zarr_filters_blosclz(self): def test_hdf5_to_zarr_filters_zstd(self): """Test HDF5 zstd filter works with ZarrDataIO.hdf5_to_zarr_filters.""" # Create a test HDF5 dataset with scaleoffset - h5file = 
h5py.File(self.hdf_filename, mode='a') + h5file = h5py.File(self.hdf_filename, mode="a") h5dset = h5file.create_dataset( - name='test_int', - data=np.arange(100, dtype='float32'), - **hdf5plugin.Zstd(clevel=22) + name="test_int", + data=np.arange(100, dtype="float32"), + **hdf5plugin.Zstd(clevel=22), ) # test that we apply shuffle filter on int data filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset) @@ -172,12 +189,12 @@ def test_hdf5_to_zarr_filters_zstd(self): def test_hdf5_to_zarr_filters_gzip(self): """Test HDF5 gzip filter works with ZarrDataIO.hdf5_to_zarr_filters.""" # Create a test HDF5 dataset with scaleoffset - h5file = h5py.File(self.hdf_filename, mode='a') + h5file = h5py.File(self.hdf_filename, mode="a") h5dset = h5file.create_dataset( - name='test_int', - data=np.arange(100, dtype='float32'), - compression='gzip', - compression_opts=2 + name="test_int", + data=np.arange(100, dtype="float32"), + compression="gzip", + compression_opts=2, ) # test that we apply shuffle filter on int data filters = ZarrDataIO.hdf5_to_zarr_filters(h5dset) @@ -189,32 +206,33 @@ def test_hdf5_to_zarr_filters_gzip(self): def test_is_h5py_dataset(self): """Test ZarrDataIO.is_h5py_dataset""" - h5file = h5py.File(self.hdf_filename, mode='a') - arr=np.arange(10) - h5dset = h5file.create_dataset(name='test', data=arr) + h5file = h5py.File(self.hdf_filename, mode="a") + arr = np.arange(10) + h5dset = h5file.create_dataset(name="test", data=arr) self.assertTrue(ZarrDataIO.is_h5py_dataset(h5dset)) self.assertFalse(ZarrDataIO.is_h5py_dataset(arr)) def test_from_h5py_dataset(self): """Test ZarrDataIO.from_h5py_dataset""" - h5file = h5py.File(self.hdf_filename, mode='a') + h5file = h5py.File(self.hdf_filename, mode="a") h5dset = h5file.create_dataset( - name='test', - data=np.arange(1000).reshape((10,100)), - compression='gzip', + name="test", + data=np.arange(1000).reshape((10, 100)), + compression="gzip", compression_opts=6, shuffle=True, fillvalue=100, - chunks=(5,10)) + chunks=(5, 10), + ) re_zarrdataio = ZarrDataIO.from_h5py_dataset(h5dset) # Test that all settings are being presevered when creating the ZarrDataIO object self.assertIsInstance(re_zarrdataio, ZarrDataIO) self.assertEqual(re_zarrdataio.data, h5dset) self.assertEqual(re_zarrdataio.fillvalue, 100) - self.assertEqual(re_zarrdataio.chunks, (5,10)) - self.assertEqual(len(re_zarrdataio.io_settings['filters']), 2) - self.assertIsInstance(re_zarrdataio.io_settings['filters'][0], numcodecs.Shuffle) - self.assertIsInstance(re_zarrdataio.io_settings['filters'][1], numcodecs.Zlib) + self.assertEqual(re_zarrdataio.chunks, (5, 10)) + self.assertEqual(len(re_zarrdataio.io_settings["filters"]), 2) + self.assertIsInstance(re_zarrdataio.io_settings["filters"][0], numcodecs.Shuffle) + self.assertIsInstance(re_zarrdataio.io_settings["filters"][1], numcodecs.Zlib) # Close the HDF5 file h5file.close() @@ -223,15 +241,16 @@ def test_from_h5py_dataset_bytes_fillvalue(self): Test ZarrDataIO.from_h5py_dataset with a fillvalue that is in bytes, which needs to be handled separately since bytes are not JSON serializable by default """ - h5file = h5py.File(self.hdf_filename, mode='a') + h5file = h5py.File(self.hdf_filename, mode="a") # print(np.arange(10, dtype=np.int8).tobytes()) h5dset = h5file.create_dataset( - name='test_str', - data=[b'hello', b'world', b'go'], - fillvalue=b'None') + name="test_str", + data=[b"hello", b"world", b"go"], + fillvalue=b"None", + ) re_zarrdataio = ZarrDataIO.from_h5py_dataset(h5dset) # Test that all settings are being presevered when 
creating the ZarrDataIO object self.assertIsInstance(re_zarrdataio, ZarrDataIO) - self.assertEqual(re_zarrdataio.io_settings['fill_value'], str("None")) + self.assertEqual(re_zarrdataio.io_settings["fill_value"], str("None")) # Close the HDF5 file - h5file.close() \ No newline at end of file + h5file.close() diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index 50eb9130..0dfa2e35 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -9,13 +9,15 @@ classes will then be run here with all different backends so that we don't need to implement the tests separately for the different backends. """ -from tests.unit.base_tests_zarrio import (BaseTestZarrWriter, - ZarrStoreTestCase, - BaseTestZarrWriteUnit, - BaseTestExportZarrToZarr) -from zarr.storage import (DirectoryStore, - NestedDirectoryStore) -from tests.unit.utils import (Baz, BazData, BazBucket, get_baz_buildmanager) + +from tests.unit.base_tests_zarrio import ( + BaseTestZarrWriter, + ZarrStoreTestCase, + BaseTestZarrWriteUnit, + BaseTestExportZarrToZarr, +) +from zarr.storage import DirectoryStore, NestedDirectoryStore +from tests.unit.utils import Baz, BazData, BazBucket, get_baz_buildmanager import zarr from hdmf_zarr.backend import ZarrIO @@ -40,6 +42,7 @@ class TestZarrWriterDefaultStore(BaseTestZarrWriter): All settings are already defined in the BaseTestZarrWriter class so we here only need to instantiate the class to run the tests. """ + pass @@ -50,6 +53,7 @@ class TestZarrWriteUnitDefaultStore(BaseTestZarrWriteUnit): All settings are already defined in the BaseTestZarrWriter class so we here only need to instantiate the class to run the tests. """ + pass @@ -60,6 +64,7 @@ class TestExportZarrToZarrDefaultStore(BaseTestExportZarrToZarr): All settings are already defined in the BaseTestZarrWriter class so we here only need to instantiate the class to run the tests. 
""" + pass @@ -68,6 +73,7 @@ class TestExportZarrToZarrDefaultStore(BaseTestExportZarrToZarr): ######################################### class TestZarrWriterDirectoryStore(BaseTestZarrWriter): """Test writing of builder with Zarr using a custom DirectoryStore""" + def setUp(self): super().setUp() self.store = DirectoryStore(self.store_path) @@ -75,6 +81,7 @@ def setUp(self): class TestZarrWriteUnitDirectoryStore(BaseTestZarrWriteUnit): """Unit test for individual write functions using a custom DirectoryStore""" + def setUp(self): self.store_path = "test_io.zarr" self.store = DirectoryStore(self.store_path) @@ -82,6 +89,7 @@ def setUp(self): class TestExportZarrToZarrDirectoryStore(BaseTestExportZarrToZarr): """Test exporting Zarr to Zarr using DirectoryStore""" + def setUp(self): super().setUp() self.store = [DirectoryStore(p) for p in self.store_path] @@ -92,6 +100,7 @@ def setUp(self): ######################################### class TestZarrWriterNestedDirectoryStore(BaseTestZarrWriter): """Test writing of builder with Zarr using a custom NestedDirectoryStore""" + def setUp(self): super().setUp() self.store = NestedDirectoryStore(self.store_path) @@ -99,6 +108,7 @@ def setUp(self): class TestZarrWriteUnitNestedDirectoryStore(BaseTestZarrWriteUnit): """Unit test for individual write functions using a custom NestedDirectoryStore""" + def setUp(self): self.store_path = "test_io.zarr" self.store = NestedDirectoryStore(self.store_path) @@ -106,6 +116,7 @@ def setUp(self): class TestExportZarrToZarrNestedDirectoryStore(BaseTestExportZarrToZarr): """Test exporting Zarr to Zarr using NestedDirectoryStore""" + def setUp(self): super().setUp() self.store = [NestedDirectoryStore(p) for p in self.store_path] @@ -116,6 +127,7 @@ def setUp(self): ######################################### class TestPathlib(BaseTestZarrWriter): """Test writing of builder with Zarr using a custom DirectoryStore""" + def setUp(self): super().setUp() self.store = pathlib.Path(self.store_path) @@ -128,30 +140,31 @@ class TestConsolidateMetadata(ZarrStoreTestCase): """ Tests for consolidated metadata and corresponding helper methods. 
""" + def test_get_store_path_shallow(self): self.create_zarr(consolidate_metadata=False) store = DirectoryStore(self.store_path) path = ZarrIO._ZarrIO__get_store_path(store) - expected_path = os.path.abspath('test_io.zarr') + expected_path = os.path.abspath("test_io.zarr") self.assertEqual(path, expected_path) def test_get_store_path_deep(self): self.create_zarr() - zarr_obj = zarr.open_consolidated(self.store_path, mode='r') + zarr_obj = zarr.open_consolidated(self.store_path, mode="r") store = zarr_obj.store path = ZarrIO._ZarrIO__get_store_path(store) - expected_path = os.path.abspath('test_io.zarr') + expected_path = os.path.abspath("test_io.zarr") self.assertEqual(path, expected_path) def test_force_open_without_consolidated(self): """Test that read-mode -r forces a regular read with mode r""" self.create_zarr(consolidate_metadata=True) # Confirm that opening the file 'r' mode indeed uses the consolidated metadata - with ZarrIO(self.store_path, mode='r') as read_io: + with ZarrIO(self.store_path, mode="r") as read_io: read_io.open() self.assertIsInstance(read_io.file.store, zarr.storage.ConsolidatedMetadataStore) # Confirm that opening the file IN 'r-' mode indeed forces a regular open without consolidated metadata - with ZarrIO(self.store_path, mode='r-') as read_io: + with ZarrIO(self.store_path, mode="r-") as read_io: read_io.open() self.assertIsInstance(read_io.file.store, zarr.storage.DirectoryStore) @@ -161,17 +174,18 @@ def test_force_open_without_consolidated_fails(self): is used to force read without consolidated metadata. """ self.create_zarr(consolidate_metadata=True) - with ZarrIO(self.store_path, mode='r') as read_io: + with ZarrIO(self.store_path, mode="r") as read_io: # Check that using 'r-' fails - msg = 'Mode r- not allowed for reading with consolidated metadata' + msg = "Mode r- not allowed for reading with consolidated metadata" with self.assertRaisesWith(ValueError, msg): - read_io._ZarrIO__open_file_consolidated(store=self.store_path, mode='r-') + read_io._ZarrIO__open_file_consolidated(store=self.store_path, mode="r-") # Check that using 'r' does not fail try: - read_io._ZarrIO__open_file_consolidated(store=self.store_path, mode='r') + read_io._ZarrIO__open_file_consolidated(store=self.store_path, mode="r") except ValueError as e: self.fail("ZarrIO.__open_file_consolidated raised an unexpected ValueError: {}".format(e)) + class TestOverwriteExistingFile(ZarrStoreTestCase): def test_force_overwrite_when_file_exists(self): """ @@ -183,7 +197,7 @@ def test_force_overwrite_when_file_exists(self): file.write("Just a test file used in TestOverwriteExistingFile") # try to create a Zarr file at the same location (i.e., self.store) as the # test text file to force overwriting the existing file. - self.create_zarr(force_overwrite=True, mode='w') + self.create_zarr(force_overwrite=True, mode="w") def test_force_overwrite_when_dir_exists(self): """ @@ -193,7 +207,7 @@ def test_force_overwrite_when_dir_exists(self): # create a Zarr file self.create_zarr() # try to overwrite the existing Zarr file - self.create_zarr(force_overwrite=True, mode='w') + self.create_zarr(force_overwrite=True, mode="w") class TestDimensionLabels(BuildDatasetShapeMixin): @@ -205,33 +219,34 @@ class TestDimensionLabels(BuildDatasetShapeMixin): ii) Create and write a BarDataHolder with a BarData. iii) Read and check that the _ARRAY_DIMENSIONS attribute is set. 
""" + def tearDown(self): shutil.rmtree(self.store) def get_base_shape_dims(self): - return [None, None], ['a', 'b'] + return [None, None], ["a", "b"] def get_dataset_inc_spec(self): dataset_inc_spec = DatasetSpec( - doc='A BarData', - data_type_inc='BarData', - quantity='*', + doc="A BarData", + data_type_inc="BarData", + quantity="*", ) return dataset_inc_spec def test_build(self): - bar_data_inst = BarData(name='my_bar', data=[[1, 2, 3], [4, 5, 6]], attr1='a string') + bar_data_inst = BarData(name="my_bar", data=[[1, 2, 3], [4, 5, 6]], attr1="a string") bar_data_holder_inst = BarDataHolder( - name='my_bar_holder', + name="my_bar_holder", bar_datas=[bar_data_inst], ) - with ZarrIO(self.store, manager=self.manager, mode='w') as io: + with ZarrIO(self.store, manager=self.manager, mode="w") as io: io.write(bar_data_holder_inst) - with ZarrIO(self.store, manager=self.manager, mode='r') as io: + with ZarrIO(self.store, manager=self.manager, mode="r") as io: file = io.read() - self.assertEqual(file.bar_datas[0].data.attrs['_ARRAY_DIMENSIONS'], ['a', 'b']) + self.assertEqual(file.bar_datas[0].data.attrs["_ARRAY_DIMENSIONS"], ["a", "b"]) class TestDatasetofReferences(ZarrStoreTestCase): @@ -243,7 +258,7 @@ def tearDown(self): """ Remove all files and folders defined by self.store_path """ - paths = self.store_path if isinstance(self.store_path, list) else [self.store_path, ] + paths = self.store_path if isinstance(self.store_path, list) else [self.store_path] for path in paths: if os.path.exists(path): if os.path.isdir(path): @@ -258,17 +273,17 @@ def test_append_references(self): num_bazs = 10 bazs = [] # set up dataset of references for i in range(num_bazs): - bazs.append(Baz(name='baz%d' % i)) - baz_data = BazData(name='baz_data', data=bazs) + bazs.append(Baz(name="baz%d" % i)) + baz_data = BazData(name="baz_data", data=bazs) container = BazBucket(bazs=bazs, baz_data=baz_data) manager = get_baz_buildmanager() - with ZarrIO(self.store, manager=manager, mode='w') as writer: + with ZarrIO(self.store, manager=manager, mode="w") as writer: writer.write(container=container) - with ZarrIO(self.store, manager=manager, mode='a') as append_io: + with ZarrIO(self.store, manager=manager, mode="a") as append_io: read_container = append_io.read() - new_baz = Baz(name='new') + new_baz = Baz(name="new") read_container.add_baz(new_baz) DoR = read_container.baz_data.data @@ -276,7 +291,7 @@ def test_append_references(self): append_io.write(read_container) - with ZarrIO(self.store, manager=manager, mode='r') as append_io: + with ZarrIO(self.store, manager=manager, mode="r") as append_io: read_container = append_io.read() self.assertEqual(len(read_container.baz_data.data), 11) self.assertIs(read_container.baz_data.data[10], read_container.bazs["new"]) diff --git a/tests/unit/utils.py b/tests/unit/utils.py index de343acd..faffbb4f 100644 --- a/tests/unit/utils.py +++ b/tests/unit/utils.py @@ -3,17 +3,27 @@ from copy import copy, deepcopy from abc import ABCMeta, abstractmethod -from hdmf.build import (ObjectMapper, TypeMap, BuildManager) -from hdmf.container import (Container, Data) -from hdmf.spec import (GroupSpec, DatasetSpec, AttributeSpec, LinkSpec, - RefSpec, DtypeSpec, NamespaceCatalog, SpecCatalog, - SpecNamespace, NamespaceBuilder, Spec) -from hdmf.spec.spec import (ZERO_OR_MANY, ONE_OR_MANY, ZERO_OR_ONE) -from hdmf.utils import (docval, getargs, get_docval) +from hdmf.build import ObjectMapper, TypeMap, BuildManager +from hdmf.container import Container, Data +from hdmf.spec import ( + GroupSpec, + 
DatasetSpec, + AttributeSpec, + LinkSpec, + RefSpec, + DtypeSpec, + NamespaceCatalog, + SpecCatalog, + SpecNamespace, + NamespaceBuilder, + Spec, +) +from hdmf.spec.spec import ZERO_OR_MANY, ONE_OR_MANY, ZERO_OR_ONE +from hdmf.utils import docval, getargs, get_docval from hdmf.testing import TestCase from hdmf_zarr.backend import ROOT_NAME -CORE_NAMESPACE = 'test_core' +CORE_NAMESPACE = "test_core" class CacheSpecTestHelper(object): @@ -23,8 +33,8 @@ def get_types(catalog): types = set() for ns_name in catalog.namespaces: ns = catalog.get_namespace(ns_name) - for source in ns['schema']: - types.update(catalog.get_types(source['source'])) + for source in ns["schema"]: + types.update(catalog.get_types(source["source"])) return types @@ -39,8 +49,9 @@ def get_temp_filepath(): def check_s3fs_ffspec_installed(): """Check if s3fs and ffspec are installed required for streaming access from S3""" try: - import s3fs # noqa F401 + import s3fs # noqa F401 import fsspec # noqa F401 + return True except ImportError: return False @@ -51,13 +62,15 @@ def check_s3fs_ffspec_installed(): ########################################### class Foo(Container): - @docval({'name': 'name', 'type': str, 'doc': 'the name of this Foo'}, - {'name': 'my_data', 'type': ('array_data', 'data'), 'doc': 'some data'}, - {'name': 'attr1', 'type': str, 'doc': 'an attribute'}, - {'name': 'attr2', 'type': int, 'doc': 'another attribute'}, - {'name': 'attr3', 'type': float, 'doc': 'a third attribute', 'default': 3.14}) + @docval( + {"name": "name", "type": str, "doc": "the name of this Foo"}, + {"name": "my_data", "type": ("array_data", "data"), "doc": "some data"}, + {"name": "attr1", "type": str, "doc": "an attribute"}, + {"name": "attr2", "type": int, "doc": "another attribute"}, + {"name": "attr3", "type": float, "doc": "a third attribute", "default": 3.14}, + ) def __init__(self, **kwargs): - name, my_data, attr1, attr2, attr3 = getargs('name', 'my_data', 'attr1', 'attr2', 'attr3', kwargs) + name, my_data, attr1, attr2, attr3 = getargs("name", "my_data", "attr1", "attr2", "attr3", kwargs) super().__init__(name=name) self.__data = my_data self.__attr1 = attr1 @@ -65,12 +78,12 @@ def __init__(self, **kwargs): self.__attr3 = attr3 def __eq__(self, other): - attrs = ('name', 'my_data', 'attr1', 'attr2', 'attr3') + attrs = ("name", "my_data", "attr1", "attr2", "attr3") return all(getattr(self, a) == getattr(other, a) for a in attrs) def __str__(self): - attrs = ('name', 'my_data', 'attr1', 'attr2', 'attr3') - return '<' + ','.join('%s=%s' % (a, getattr(self, a)) for a in attrs) + '>' + attrs = ("name", "my_data", "attr1", "attr2", "attr3") + return "<" + ",".join("%s=%s" % (a, getattr(self, a)) for a in attrs) + ">" @property def my_data(self): @@ -94,10 +107,12 @@ def __hash__(self): class FooBucket(Container): - @docval({'name': 'name', 'type': str, 'doc': 'the name of this bucket'}, - {'name': 'foos', 'type': list, 'doc': 'the Foo objects in this bucket', 'default': list()}) + @docval( + {"name": "name", "type": str, "doc": "the name of this bucket"}, + {"name": "foos", "type": list, "doc": "the Foo objects in this bucket", "default": list()}, + ) def __init__(self, **kwargs): - name, foos = getargs('name', 'foos', kwargs) + name, foos = getargs("name", "foos", kwargs) super().__init__(name=name) self.__foos = {f.name: f for f in foos} # note: collections of groups are unordered in HDF5 for f in foos: @@ -107,7 +122,7 @@ def __eq__(self, other): return self.name == other.name and self.foos == other.foos def __str__(self): - return 
'name=%s, foos=%s' % (self.name, self.foos) + return "name=%s, foos=%s" % (self.name, self.foos) @property def foos(self): @@ -126,14 +141,16 @@ class FooFile(Container): and should be reset to 'root' when use is finished to avoid potential cross-talk between tests. """ - @docval({'name': 'buckets', 'type': list, 'doc': 'the FooBuckets in this file', 'default': list()}, - {'name': 'foo_link', 'type': Foo, 'doc': 'an optional linked Foo', 'default': None}, - {'name': 'foofile_data', 'type': 'array_data', 'doc': 'an optional dataset', 'default': None}, - {'name': 'foo_ref_attr', 'type': Foo, 'doc': 'a reference Foo', 'default': None}, - ) + @docval( + {"name": "buckets", "type": list, "doc": "the FooBuckets in this file", "default": list()}, + {"name": "foo_link", "type": Foo, "doc": "an optional linked Foo", "default": None}, + {"name": "foofile_data", "type": "array_data", "doc": "an optional dataset", "default": None}, + {"name": "foo_ref_attr", "type": Foo, "doc": "a reference Foo", "default": None}, + ) def __init__(self, **kwargs): - buckets, foo_link, foofile_data, foo_ref_attr = getargs('buckets', 'foo_link', 'foofile_data', - 'foo_ref_attr', kwargs) + buckets, foo_link, foofile_data, foo_ref_attr = getargs( + "buckets", "foo_link", "foofile_data", "foo_ref_attr", kwargs + ) super().__init__(name=ROOT_NAME) # name is not used - FooFile should be the root container self.__buckets = {b.name: b for b in buckets} # note: collections of groups are unordered in HDF5 for f in buckets: @@ -143,12 +160,14 @@ def __init__(self, **kwargs): self.__foo_ref_attr = foo_ref_attr def __eq__(self, other): - return (self.buckets == other.buckets - and self.foo_link == other.foo_link - and self.foofile_data == other.foofile_data) + return ( + self.buckets == other.buckets + and self.foo_link == other.foo_link + and self.foofile_data == other.foofile_data + ) def __str__(self): - return ('buckets=%s, foo_link=%s, foofile_data=%s' % (self.buckets, self.foo_link, self.foofile_data)) + return "buckets=%s, foo_link=%s, foofile_data=%s" % (self.buckets, self.foo_link, self.foofile_data) @property def buckets(self): @@ -204,91 +223,93 @@ def get_foo_buildmanager(): :return: """ - foo_spec = GroupSpec('A test group specification with a data type', - data_type_def='Foo', - datasets=[DatasetSpec('an example dataset', - 'int', - name='my_data', - attributes=[AttributeSpec('attr2', - 'an example integer attribute', - 'int')])], - attributes=[AttributeSpec('attr1', 'an example string attribute', 'text'), - AttributeSpec('attr3', 'an example float attribute', 'float')]) - - tmp_spec = GroupSpec('A subgroup for Foos', - name='foo_holder', - groups=[GroupSpec('the Foos in this bucket', data_type_inc='Foo', quantity=ZERO_OR_MANY)]) - - bucket_spec = GroupSpec('A test group specification for a data type containing data type', - data_type_def='FooBucket', - groups=[tmp_spec]) + foo_spec = GroupSpec( + "A test group specification with a data type", + data_type_def="Foo", + datasets=[ + DatasetSpec( + "an example dataset", + "int", + name="my_data", + attributes=[AttributeSpec("attr2", "an example integer attribute", "int")], + ) + ], + attributes=[ + AttributeSpec("attr1", "an example string attribute", "text"), + AttributeSpec("attr3", "an example float attribute", "float"), + ], + ) + + tmp_spec = GroupSpec( + "A subgroup for Foos", + name="foo_holder", + groups=[GroupSpec("the Foos in this bucket", data_type_inc="Foo", quantity=ZERO_OR_MANY)], + ) + + bucket_spec = GroupSpec( + "A test group specification for a data 
type containing data type", data_type_def="FooBucket", groups=[tmp_spec] + ) class FooMapper(ObjectMapper): def __init__(self, spec): super().__init__(spec) - my_data_spec = spec.get_dataset('my_data') - self.map_spec('attr2', my_data_spec.get_attribute('attr2')) + my_data_spec = spec.get_dataset("my_data") + self.map_spec("attr2", my_data_spec.get_attribute("attr2")) class BucketMapper(ObjectMapper): def __init__(self, spec): super().__init__(spec) - foo_holder_spec = spec.get_group('foo_holder') + foo_holder_spec = spec.get_group("foo_holder") self.unmap(foo_holder_spec) - foo_spec = foo_holder_spec.get_data_type('Foo') - self.map_spec('foos', foo_spec) - - file_links_spec = GroupSpec('Foo link group', - name='links', - links=[LinkSpec('Foo link', - name='foo_link', - target_type='Foo', - quantity=ZERO_OR_ONE)] - ) - - file_spec = GroupSpec("A file of Foos contained in FooBuckets", - data_type_def='FooFile', - groups=[GroupSpec('Holds the FooBuckets', - name='buckets', - groups=[GroupSpec("One or more FooBuckets", - data_type_inc='FooBucket', - quantity=ZERO_OR_MANY)]), - file_links_spec], - datasets=[DatasetSpec('Foo data', - name='foofile_data', - dtype='int', - quantity=ZERO_OR_ONE)], - attributes=[AttributeSpec(doc='Foo ref attr', - name='foo_ref_attr', - dtype=RefSpec('Foo', 'object'), - required=False)], - ) + foo_spec = foo_holder_spec.get_data_type("Foo") + self.map_spec("foos", foo_spec) + + file_links_spec = GroupSpec( + "Foo link group", + name="links", + links=[LinkSpec("Foo link", name="foo_link", target_type="Foo", quantity=ZERO_OR_ONE)], + ) + + file_spec = GroupSpec( + "A file of Foos contained in FooBuckets", + data_type_def="FooFile", + groups=[ + GroupSpec( + "Holds the FooBuckets", + name="buckets", + groups=[GroupSpec("One or more FooBuckets", data_type_inc="FooBucket", quantity=ZERO_OR_MANY)], + ), + file_links_spec, + ], + datasets=[DatasetSpec("Foo data", name="foofile_data", dtype="int", quantity=ZERO_OR_ONE)], + attributes=[ + AttributeSpec(doc="Foo ref attr", name="foo_ref_attr", dtype=RefSpec("Foo", "object"), required=False) + ], + ) class FileMapper(ObjectMapper): def __init__(self, spec): super().__init__(spec) - bucket_spec = spec.get_group('buckets').get_data_type('FooBucket') - self.map_spec('buckets', bucket_spec) - self.unmap(spec.get_group('links')) - foo_link_spec = spec.get_group('links').get_link('foo_link') - self.map_spec('foo_link', foo_link_spec) + bucket_spec = spec.get_group("buckets").get_data_type("FooBucket") + self.map_spec("buckets", bucket_spec) + self.unmap(spec.get_group("links")) + foo_link_spec = spec.get_group("links").get_link("foo_link") + self.map_spec("foo_link", foo_link_spec) spec_catalog = SpecCatalog() - spec_catalog.register_spec(foo_spec, 'test.yaml') - spec_catalog.register_spec(bucket_spec, 'test.yaml') - spec_catalog.register_spec(file_spec, 'test.yaml') + spec_catalog.register_spec(foo_spec, "test.yaml") + spec_catalog.register_spec(bucket_spec, "test.yaml") + spec_catalog.register_spec(file_spec, "test.yaml") namespace = SpecNamespace( - 'a test namespace', - CORE_NAMESPACE, - [{'source': 'test.yaml'}], - version='0.1.0', - catalog=spec_catalog) + "a test namespace", CORE_NAMESPACE, [{"source": "test.yaml"}], version="0.1.0", catalog=spec_catalog + ) namespace_catalog = NamespaceCatalog() namespace_catalog.add_namespace(CORE_NAMESPACE, namespace) type_map = TypeMap(namespace_catalog) - type_map.register_container_type(CORE_NAMESPACE, 'Foo', Foo) - type_map.register_container_type(CORE_NAMESPACE, 'FooBucket', 
FooBucket) - type_map.register_container_type(CORE_NAMESPACE, 'FooFile', FooFile) + type_map.register_container_type(CORE_NAMESPACE, "Foo", Foo) + type_map.register_container_type(CORE_NAMESPACE, "FooBucket", FooBucket) + type_map.register_container_type(CORE_NAMESPACE, "FooFile", FooFile) type_map.register_map(Foo, FooMapper) type_map.register_map(FooBucket, BucketMapper) @@ -317,12 +338,14 @@ class BazCpdData(Data): class BazBucket(Container): - @docval({'name': 'name', 'type': str, 'doc': 'the name of this bucket', 'default': ROOT_NAME}, - {'name': 'bazs', 'type': list, 'doc': 'the Baz objects in this bucket'}, - {'name': 'baz_data', 'type': BazData, 'doc': 'dataset of Baz references', 'default': None}, - {'name': 'baz_cpd_data', 'type': BazCpdData, 'doc': 'dataset of Baz references', 'default': None}) + @docval( + {"name": "name", "type": str, "doc": "the name of this bucket", "default": ROOT_NAME}, + {"name": "bazs", "type": list, "doc": "the Baz objects in this bucket"}, + {"name": "baz_data", "type": BazData, "doc": "dataset of Baz references", "default": None}, + {"name": "baz_cpd_data", "type": BazCpdData, "doc": "dataset of Baz references", "default": None}, + ) def __init__(self, **kwargs): - name, bazs, baz_data, baz_cpd_data = getargs('name', 'bazs', 'baz_data', 'baz_cpd_data', kwargs) + name, bazs, baz_data, baz_cpd_data = getargs("name", "bazs", "baz_data", "baz_cpd_data", kwargs) super().__init__(name=name) self.__bazs = {b.name: b for b in bazs} # note: collections of groups are unordered in HDF5 for b in bazs: @@ -358,70 +381,75 @@ def remove_baz(self, baz_name): def get_baz_buildmanager(): baz_spec = GroupSpec( - doc='A test group specification with a data type', - data_type_def='Baz', + doc="A test group specification with a data type", + data_type_def="Baz", ) baz_data_spec = DatasetSpec( - doc='A test dataset of references specification with a data type', - name='baz_data', - data_type_def='BazData', - dtype=RefSpec('Baz', 'object'), + doc="A test dataset of references specification with a data type", + name="baz_data", + data_type_def="BazData", + dtype=RefSpec("Baz", "object"), shape=[None], ) baz_cpd_data_spec = DatasetSpec( - doc='A test compound dataset with references specification with a data type', - name='baz_cpd_data', - data_type_def='BazCpdData', - dtype=[DtypeSpec(name='part1', doc='doc', dtype='int'), - DtypeSpec(name='part2', doc='doc', dtype=RefSpec('Baz', 'object'))], + doc="A test compound dataset with references specification with a data type", + name="baz_cpd_data", + data_type_def="BazCpdData", + dtype=[ + DtypeSpec(name="part1", doc="doc", dtype="int"), + DtypeSpec(name="part2", doc="doc", dtype=RefSpec("Baz", "object")), + ], shape=[None], ) baz_holder_spec = GroupSpec( - doc='group of bazs', - name='bazs', - groups=[GroupSpec(doc='Baz', data_type_inc='Baz', quantity=ONE_OR_MANY)], + doc="group of bazs", + name="bazs", + groups=[GroupSpec(doc="Baz", data_type_inc="Baz", quantity=ONE_OR_MANY)], ) baz_bucket_spec = GroupSpec( - doc='A test group specification for a data type containing data type', - data_type_def='BazBucket', + doc="A test group specification for a data type containing data type", + data_type_def="BazBucket", groups=[baz_holder_spec], - datasets=[DatasetSpec(doc='doc', data_type_inc='BazData', quantity=ZERO_OR_ONE), - DatasetSpec(doc='doc', data_type_inc='BazCpdData', quantity=ZERO_OR_ONE)], + datasets=[ + DatasetSpec(doc="doc", data_type_inc="BazData", quantity=ZERO_OR_ONE), + DatasetSpec(doc="doc", data_type_inc="BazCpdData", 
quantity=ZERO_OR_ONE), + ], ) spec_catalog = SpecCatalog() - spec_catalog.register_spec(baz_spec, 'test.yaml') - spec_catalog.register_spec(baz_data_spec, 'test.yaml') - spec_catalog.register_spec(baz_cpd_data_spec, 'test.yaml') - spec_catalog.register_spec(baz_bucket_spec, 'test.yaml') + spec_catalog.register_spec(baz_spec, "test.yaml") + spec_catalog.register_spec(baz_data_spec, "test.yaml") + spec_catalog.register_spec(baz_cpd_data_spec, "test.yaml") + spec_catalog.register_spec(baz_bucket_spec, "test.yaml") namespace = SpecNamespace( - 'a test namespace', + "a test namespace", CORE_NAMESPACE, - [{'source': 'test.yaml'}], - version='0.1.0', - catalog=spec_catalog) + [{"source": "test.yaml"}], + version="0.1.0", + catalog=spec_catalog, + ) namespace_catalog = NamespaceCatalog() namespace_catalog.add_namespace(CORE_NAMESPACE, namespace) type_map = TypeMap(namespace_catalog) - type_map.register_container_type(CORE_NAMESPACE, 'Baz', Baz) - type_map.register_container_type(CORE_NAMESPACE, 'BazData', BazData) - type_map.register_container_type(CORE_NAMESPACE, 'BazCpdData', BazCpdData) - type_map.register_container_type(CORE_NAMESPACE, 'BazBucket', BazBucket) + type_map.register_container_type(CORE_NAMESPACE, "Baz", Baz) + type_map.register_container_type(CORE_NAMESPACE, "BazData", BazData) + type_map.register_container_type(CORE_NAMESPACE, "BazCpdData", BazCpdData) + type_map.register_container_type(CORE_NAMESPACE, "BazBucket", BazBucket) class BazBucketMapper(ObjectMapper): def __init__(self, spec): super().__init__(spec) - baz_holder_spec = spec.get_group('bazs') + baz_holder_spec = spec.get_group("bazs") self.unmap(baz_holder_spec) - baz_spec = baz_holder_spec.get_data_type('Baz') - self.map_spec('bazs', baz_spec) + baz_spec = baz_holder_spec.get_data_type("Baz") + self.map_spec("bazs", baz_spec) type_map.register_map(BazBucket, BazBucketMapper) @@ -438,15 +466,15 @@ def create_test_type_map(specs, container_classes, mappers=None): :return: the constructed TypeMap """ spec_catalog = SpecCatalog() - schema_file = 'test.yaml' + schema_file = "test.yaml" for s in specs: spec_catalog.register_spec(s, schema_file) namespace = SpecNamespace( - doc='a test namespace', + doc="a test namespace", name=CORE_NAMESPACE, - schema=[{'source': schema_file}], - version='0.1.0', - catalog=spec_catalog + schema=[{"source": schema_file}], + version="0.1.0", + catalog=spec_catalog, ) namespace_catalog = NamespaceCatalog() namespace_catalog.add_namespace(CORE_NAMESPACE, namespace) @@ -475,11 +503,11 @@ def create_load_namespace_yaml(namespace_name, specs, output_dir, incl_types, ty """ ns_builder = NamespaceBuilder( name=namespace_name, - doc='a test namespace', - version='0.1.0', + doc="a test namespace", + version="0.1.0", ) - ns_filename = ns_builder.name + '.namespace.yaml' - ext_filename = ns_builder.name + '.extensions.yaml' + ns_filename = ns_builder.name + ".namespace.yaml" + ext_filename = ns_builder.name + ".extensions.yaml" for ns, types in incl_types.items(): if types is None: # include all types @@ -498,39 +526,52 @@ def create_load_namespace_yaml(namespace_name, specs, output_dir, incl_types, ty # ##### custom spec classes ##### + def swap_inc_def(cls, custom_cls): args = get_docval(cls.__init__) ret = list() for arg in args: - if arg['name'] == 'data_type_def': - ret.append({'name': 'my_data_type_def', 'type': str, - 'doc': 'the NWB data type this spec defines', 'default': None}) - elif arg['name'] == 'data_type_inc': - ret.append({'name': 'my_data_type_inc', 'type': (custom_cls, str), - 'doc': 
'the NWB data type this spec includes', 'default': None}) + if arg["name"] == "data_type_def": + ret.append( + { + "name": "my_data_type_def", + "type": str, + "doc": "the NWB data type this spec defines", + "default": None, + } + ) + elif arg["name"] == "data_type_inc": + ret.append( + { + "name": "my_data_type_inc", + "type": (custom_cls, str), + "doc": "the NWB data type this spec includes", + "default": None, + } + ) else: ret.append(copy(arg)) return ret class BaseStorageOverride: - __type_key = 'my_data_type' - __inc_key = 'my_data_type_inc' - __def_key = 'my_data_type_def' + __type_key = "my_data_type" + __inc_key = "my_data_type_inc" + __def_key = "my_data_type_def" @classmethod def type_key(cls): - ''' Get the key used to store data type on an instance''' + """Get the key used to store data type on an instance""" return cls.__type_key @classmethod def inc_key(cls): - ''' Get the key used to define a data_type include.''' + """Get the key used to define a data_type include.""" return cls.__inc_key @classmethod def def_key(cls): - ''' Get the key used to define a data_type definition.''' + """Get the key used to define a data_type definition.""" return cls.__def_key @classmethod @@ -556,7 +597,7 @@ def _translate_kwargs(cls, kwargs): class CustomGroupSpec(BaseStorageOverride, GroupSpec): - @docval(*deepcopy(swap_inc_def(GroupSpec, 'CustomGroupSpec'))) + @docval(*deepcopy(swap_inc_def(GroupSpec, "CustomGroupSpec"))) def __init__(self, **kwargs): kwargs = self._translate_kwargs(kwargs) super().__init__(**kwargs) @@ -565,15 +606,15 @@ def __init__(self, **kwargs): def dataset_spec_cls(cls): return CustomDatasetSpec - @docval(*deepcopy(swap_inc_def(GroupSpec, 'CustomGroupSpec'))) + @docval(*deepcopy(swap_inc_def(GroupSpec, "CustomGroupSpec"))) def add_group(self, **kwargs): spec = CustomGroupSpec(**kwargs) self.set_group(spec) return spec - @docval(*deepcopy(swap_inc_def(DatasetSpec, 'CustomDatasetSpec'))) + @docval(*deepcopy(swap_inc_def(DatasetSpec, "CustomDatasetSpec"))) def add_dataset(self, **kwargs): - ''' Add a new specification for a subgroup to this group specification ''' + """Add a new specification for a subgroup to this group specification""" spec = CustomDatasetSpec(**kwargs) self.set_dataset(spec) return spec @@ -581,14 +622,14 @@ def add_dataset(self, **kwargs): class CustomDatasetSpec(BaseStorageOverride, DatasetSpec): - @docval(*deepcopy(swap_inc_def(DatasetSpec, 'CustomDatasetSpec'))) + @docval(*deepcopy(swap_inc_def(DatasetSpec, "CustomDatasetSpec"))) def __init__(self, **kwargs): kwargs = self._translate_kwargs(kwargs) super().__init__(**kwargs) class CustomSpecNamespace(SpecNamespace): - __types_key = 'my_data_types' + __types_key = "my_data_types" @classmethod def types_key(cls): @@ -597,21 +638,23 @@ def types_key(cls): class BarData(Data): - @docval({'name': 'name', 'type': str, 'doc': 'the name of this BarData'}, - {'name': 'data', 'type': ('data', 'array_data'), 'doc': 'the data'}, - {'name': 'attr1', 'type': str, 'doc': 'a string attribute', 'default': None}, - {'name': 'attr2', 'type': 'int', 'doc': 'an int attribute', 'default': None}, - {'name': 'ext_attr', 'type': bool, 'doc': 'a boolean attribute', 'default': True}) + @docval( + {"name": "name", "type": str, "doc": "the name of this BarData"}, + {"name": "data", "type": ("data", "array_data"), "doc": "the data"}, + {"name": "attr1", "type": str, "doc": "a string attribute", "default": None}, + {"name": "attr2", "type": "int", "doc": "an int attribute", "default": None}, + {"name": "ext_attr", "type": 
bool, "doc": "a boolean attribute", "default": True}, + ) def __init__(self, **kwargs): - name, data, attr1, attr2, ext_attr = getargs('name', 'data', 'attr1', 'attr2', 'ext_attr', kwargs) + name, data, attr1, attr2, ext_attr = getargs("name", "data", "attr1", "attr2", "ext_attr", kwargs) super().__init__(name=name, data=data) self.__attr1 = attr1 self.__attr2 = attr2 - self.__ext_attr = kwargs['ext_attr'] + self.__ext_attr = kwargs["ext_attr"] @property def data_type(self): - return 'BarData' + return "BarData" @property def attr1(self): @@ -628,10 +671,12 @@ def ext_attr(self): class BarDataHolder(Container): - @docval({'name': 'name', 'type': str, 'doc': 'the name of this BarDataHolder'}, - {'name': 'bar_datas', 'type': ('data', 'array_data'), 'doc': 'bar_datas', 'default': list()}) + @docval( + {"name": "name", "type": str, "doc": "the name of this BarDataHolder"}, + {"name": "bar_datas", "type": ("data", "array_data"), "doc": "bar_datas", "default": list()}, + ) def __init__(self, **kwargs): - name, bar_datas = getargs('name', 'bar_datas', kwargs) + name, bar_datas = getargs("name", "bar_datas", kwargs) super().__init__(name=name) self.__bar_datas = bar_datas for b in bar_datas: @@ -640,7 +685,7 @@ def __init__(self, **kwargs): @property def data_type(self): - return 'BarDataHolder' + return "BarDataHolder" @property def bar_datas(self): @@ -649,17 +694,19 @@ def bar_datas(self): class ExtBarDataMapper(ObjectMapper): - @docval({"name": "spec", "type": Spec, "doc": "the spec to get the attribute value for"}, - {"name": "container", "type": BarData, "doc": "the container to get the attribute value from"}, - {"name": "manager", "type": BuildManager, "doc": "the BuildManager used for managing this build"}, - returns='the value of the attribute') + @docval( + {"name": "spec", "type": Spec, "doc": "the spec to get the attribute value for"}, + {"name": "container", "type": BarData, "doc": "the container to get the attribute value from"}, + {"name": "manager", "type": BuildManager, "doc": "the BuildManager used for managing this build"}, + returns="the value of the attribute", + ) def get_attr_value(self, **kwargs): - ''' Get the value of the attribute corresponding to this spec from the given container ''' - spec, container, manager = getargs('spec', 'container', 'manager', kwargs) + """Get the value of the attribute corresponding to this spec from the given container""" + spec, container, manager = getargs("spec", "container", "manager", kwargs) # handle custom mapping of field 'ext_attr' within container # BardataHolder/BarData -> spec BarDataHolder/BarData.ext_attr if isinstance(container.parent, BarDataHolder): - if spec.name == 'ext_attr': + if spec.name == "ext_attr": return container.ext_attr return super().get_attr_value(**kwargs) @@ -670,20 +717,20 @@ def setUp(self): self.store = "tests/unit/test_io.zarr" self.set_up_specs() spec_catalog = SpecCatalog() - spec_catalog.register_spec(self.bar_data_spec, 'test.yaml') - spec_catalog.register_spec(self.bar_data_holder_spec, 'test.yaml') + spec_catalog.register_spec(self.bar_data_spec, "test.yaml") + spec_catalog.register_spec(self.bar_data_holder_spec, "test.yaml") namespace = SpecNamespace( - doc='a test namespace', + doc="a test namespace", name=CORE_NAMESPACE, - schema=[{'source': 'test.yaml'}], - version='0.1.0', - catalog=spec_catalog + schema=[{"source": "test.yaml"}], + version="0.1.0", + catalog=spec_catalog, ) namespace_catalog = NamespaceCatalog() namespace_catalog.add_namespace(CORE_NAMESPACE, namespace) type_map = 
TypeMap(namespace_catalog) - type_map.register_container_type(CORE_NAMESPACE, 'BarData', BarData) - type_map.register_container_type(CORE_NAMESPACE, 'BarDataHolder', BarDataHolder) + type_map.register_container_type(CORE_NAMESPACE, "BarData", BarData) + type_map.register_container_type(CORE_NAMESPACE, "BarDataHolder", BarDataHolder) type_map.register_map(BarData, ExtBarDataMapper) type_map.register_map(BarDataHolder, ObjectMapper) self.manager = BuildManager(type_map) @@ -691,15 +738,15 @@ def setUp(self): def set_up_specs(self): shape, dims = self.get_base_shape_dims() self.bar_data_spec = DatasetSpec( - doc='A test dataset specification with a data type', - data_type_def='BarData', - dtype='int', + doc="A test dataset specification with a data type", + data_type_def="BarData", + dtype="int", shape=shape, dims=dims, ) self.bar_data_holder_spec = GroupSpec( - doc='A container of multiple extended BarData objects', - data_type_def='BarDataHolder', + doc="A container of multiple extended BarData objects", + data_type_def="BarDataHolder", datasets=[self.get_dataset_inc_spec()], )
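
The parallel-write tests above (test_mixed_iterator_pickleability, test_simple_tqdm, test_compound_tqdm, test_extra_keyword_argument_propagation) all funnel through the same ZarrIO.write call. The following is a minimal sketch of that call outside the test harness, assuming the hdmf.common table classes and manager and an illustrative output path; the multiprocessing_context value is only an example of the kind of keyword argument the tests propagate, not a recommendation.

import numpy as np
from hdmf.common import DynamicTable, VectorData, get_manager
from hdmf_zarr.backend import ZarrIO

# A plain numpy-backed column; with a pickleable GenericDataChunkIterator
# subclass instead, number_of_jobs > 1 parallelizes the chunked writes.
column = VectorData(name="TestColumn", description="", data=np.array([1.0, 2.0, 3.0]))
table = DynamicTable(name="TestTable", description="", id=list(range(3)), columns=[column])

with ZarrIO(path="example_table.zarr", manager=get_manager(), mode="w") as io:  # illustrative path
    io.write(
        container=table,
        number_of_jobs=2,                 # size of the worker pool used for iterator data
        max_threads_per_process=2,        # forwarded to ZarrIO's internal DCI write queue
        multiprocessing_context="spawn",  # illustrative; the test sweeps several keyword pairs
    )

with ZarrIO(path="example_table.zarr", manager=get_manager(), mode="r") as io:
    round_tripped = io.read()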
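
TestZarrDataIO checks how HDF5 dataset I/O settings translate to Zarr. Below is a condensed sketch of the happy path those tests cover; the file name is illustrative, and as the warning tests above assert, unsupported HDF5 filters (scaleoffset, szip/lzf, and third-party plugin filters) are not converted but only produce a UserWarning.

import h5py
import numpy as np
from hdmf_zarr.utils import ZarrDataIO

with h5py.File("example.h5", mode="w") as h5file:  # illustrative file name
    h5dset = h5file.create_dataset(
        name="test",
        data=np.arange(1000).reshape((10, 100)),
        compression="gzip",
        compression_opts=6,
        shuffle=True,
        chunks=(5, 10),
    )
    if ZarrDataIO.is_h5py_dataset(h5dset):
        wrapped = ZarrDataIO.from_h5py_dataset(h5dset)
        # Chunking is preserved and shuffle/gzip become numcodecs.Shuffle/numcodecs.Zlib,
        # mirroring the assertions in test_from_h5py_dataset.
        chunks = wrapped.chunks                   # (5, 10)
        filters = wrapped.io_settings["filters"]  # [Shuffle, Zlib]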
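
TestConsolidateMetadata distinguishes ZarrIO's two read modes. As a sketch, with "existing.zarr" standing in for a directory store previously written by ZarrIO with consolidated metadata:

import zarr
from hdmf_zarr.backend import ZarrIO

with ZarrIO("existing.zarr", mode="r") as read_io:   # "r" prefers the consolidated metadata
    read_io.open()
    assert isinstance(read_io.file.store, zarr.storage.ConsolidatedMetadataStore)

with ZarrIO("existing.zarr", mode="r-") as read_io:  # "r-" forces a regular, non-consolidated open
    read_io.open()
    assert isinstance(read_io.file.store, zarr.storage.DirectoryStore)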