Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support overwriting an existing file at the same location #229

Merged
merged 10 commits into from
Nov 8, 2024
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
### Enhancements
* Added support for Pathlib paths. @mavaylon1 [#212](https://github.com/hdmf-dev/hdmf-zarr/pull/212)
* Updated packages used for testing and readthedocs configuration. @mavaylon1, @rly [#214](https://github.com/hdmf-dev/hdmf-zarr/pull/214)
* Add `force_overwite` parameter for `ZarrIO.__init__` to allow overwriting an existing file or directory. @oruebel [#229](https://github.com/hdmf-dev/hdmf-zarr/pull/229)

## 0.9.0 (September 16, 2024)
### Enhancements
Expand Down
46 changes: 37 additions & 9 deletions src/hdmf_zarr/backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Module with the Zarr-based I/O-backend for HDMF"""
# Python imports
import os
import shutil
import warnings
import numpy as np
import tempfile
Expand Down Expand Up @@ -91,7 +92,7 @@ def can_read(path):
'doc': 'the path to the Zarr file or a supported Zarr store'},
{'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None},
{'name': 'mode', 'type': str,
'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-") '
'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "r-"). '
'the mode r- is used to force open without consolidated metadata in read only mode.'},
{'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool),
'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.',
Expand All @@ -102,11 +103,18 @@ def can_read(path):
'default': None},
{'name': 'storage_options', 'type': dict,
'doc': 'Zarr storage options to read remote folders',
'default': None})
'default': None},
{'name': 'force_overwrite',
'type': bool,
'doc': "force overwriting existing object when in 'w' mode. The existing file or directory"
" will be deleted when before opening (even if the object is not Zarr, e.g,. an HDF5 file)",
'default': False}
)
def __init__(self, **kwargs):
self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
path, manager, mode, synchronizer, object_codec_class, storage_options = popargs(
'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', kwargs)
path, manager, mode, synchronizer, object_codec_class, storage_options, force_overwrite = popargs(
'path', 'manager', 'mode', 'synchronizer', 'object_codec_class',
'storage_options', 'force_overwrite', kwargs)
if manager is None:
manager = BuildManager(TypeMap(NamespaceCatalog()))
if isinstance(synchronizer, bool):
Expand All @@ -118,6 +126,7 @@ def __init__(self, **kwargs):
else:
self.__synchronizer = synchronizer
self.__mode = mode
self.__force_overwrite = force_overwrite
if isinstance(path, Path):
path = str(path)
self.__path = path
Expand Down Expand Up @@ -160,25 +169,44 @@ def synchronizer(self):
def object_codec_class(self):
return self.__codec_cls

@property
def mode(self):
"""
The mode specified by the user when creating the ZarrIO instance.
NOTE: The Zarr library may not honor the mode. E.g., DirectoryStore in Zarr uses
append mode and does not allow setting a file to read-only mode.
"""
return self.__mode

def open(self):
"""Open the Zarr file"""
if self.__file is None:
# Allow overwriting an existing file (e.g., an HDF5 file). Zarr will normally fail if the
# existing object at the path is a file. So if we are in `w` mode we need to delete the file first
if self.mode == 'w' and self.__force_overwrite:
if isinstance(self.path, (str, Path)) and os.path.exists(self.path):
if os.path.isdir(self.path): # directory
shutil.rmtree(self.path)
else: # File
os.remove(self.path)

# Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'.
# As a result, when in other modes, the file will not use consolidated metadata.
if self.__mode != 'r':
if self.mode != 'r':
# When we consolidate metadata, we use ConsolidatedMetadataStore.
# This interface does not allow for setting items.
# In the doc string, it says it is "read only". As a result, we cannot use r+ with consolidate_metadata.
# r- is only an internal mode in ZarrIO to force the use of regular open. For Zarr we need to
# use the regular mode r when r- is specified
mode_to_use = self.__mode if self.__mode != 'r-' else 'r'
mode_to_use = self.mode if self.mode != 'r-' else 'r'
self.__file = zarr.open(store=self.path,
mode=mode_to_use,
synchronizer=self.__synchronizer,
storage_options=self.__storage_options)
else:
self.__file = self.__open_file_consolidated(store=self.path,
mode=self.__mode,
mode=self.mode,
synchronizer=self.__synchronizer,
storage_options=self.__storage_options)

Expand Down Expand Up @@ -343,9 +371,9 @@ def export(self, **kwargs):
"""Export data read from a file from any backend to Zarr.
See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details.
"""
if self.__mode != 'w':
if self.mode != 'w':
raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'."
% (self.source, self.__mode))
% (self.source, self.mode))

src_io = getargs('src_io', kwargs)
write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs)
Expand Down
9 changes: 6 additions & 3 deletions tests/unit/base_tests_zarrio.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ def setUp(self):

def tearDown(self):
if os.path.exists(self.store):
shutil.rmtree(self.store)
if os.path.isdir(self.store):
shutil.rmtree(self.store)
else: # in case a test created a file instead of a directory
os.remove(self.store)

def createReferenceBuilder(self):
data_1 = np.arange(100, 200, 10).reshape(2, 5)
Expand All @@ -117,9 +120,9 @@ def createReferenceBuilder(self):
'ref_dataset': dataset_ref})
return builder

def create_zarr(self, consolidate_metadata=True):
def create_zarr(self, consolidate_metadata=True, force_overwrite=False, mode='a'):
builder = self.createReferenceBuilder()
writer = ZarrIO(self.store, mode='a')
writer = ZarrIO(self.store, mode=mode, force_overwrite=force_overwrite)
writer.write_builder(builder, consolidate_metadata)
writer.close()

Expand Down
23 changes: 23 additions & 0 deletions tests/unit/test_zarrio.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,29 @@ def test_force_open_without_consolidated_fails(self):
except ValueError as e:
self.fail("ZarrIO.__open_file_consolidated raised an unexpected ValueError: {}".format(e))

class TestOverwriteExistingFile(ZarrStoreTestCase):
def test_force_overwrite_when_file_exists(self):
"""
Test that we can overwrite a file when opening with `w` mode even if there is
an existing file. Zarr can write into a directory but not a file.
"""
# create a dummy text file
with open(self.store, "w") as file:
file.write("Just a test file used in TestOverwriteExistingFile")
# try to create a Zarr file at the same location (i.e., self.store) as the
# test text file to force overwriting the existing file.
self.create_zarr(force_overwrite=True, mode='w')

def test_force_overwrite_when_dir_exists(self):
"""
Test that we can overwrite a directory when opening with `w` mode even if there is
an existing directory.
"""
# create a Zarr file
self.create_zarr()
# try to overwrite the existing Zarr file
self.create_zarr(force_overwrite=True, mode='w')


class TestDimensionLabels(BuildDatasetShapeMixin):
"""
Expand Down
Loading