diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index 27acfaf610..818cd0c128 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -1,3 +1,4 @@ +import json import os import deeplake @@ -53,7 +54,7 @@ from deeplake.util.auto import get_most_common_extension from deeplake.util.bugout_reporter import feature_report_path, deeplake_reporter from deeplake.util.delete_entry import remove_path_from_backend -from deeplake.util.keys import dataset_exists +from deeplake.util.keys import dataset_exists, get_dataset_meta_key, FIRST_COMMIT_ID from deeplake.util.exceptions import ( AgreementError, DatasetHandlerError, @@ -237,6 +238,11 @@ def init( if ds_exists: if overwrite: + if not dataset._allow_delete(cache_chain): + raise DatasetHandlerError( + "Dataset overwrite failed. The dataset is marked as delete_allowed=false. To allow overwrite, you must first run `allow_delete(True)` on the dataset." + ) + try: cache_chain.clear() except Exception as e: @@ -457,6 +463,11 @@ def empty( raise if overwrite and dataset_exists(cache_chain): + if not dataset._allow_delete(cache_chain): + raise DatasetHandlerError( + "Dataset overwrite failed. The dataset is marked as delete_allowed=false. To allow overwrite, you must first run `allow_delete(True)` on the dataset." + ) + try: cache_chain.clear() except Exception as e: @@ -866,6 +877,7 @@ def delete( ds = deeplake.load(path, verbose=False, token=token, creds=creds) except UserNotLoggedInException: raise UserNotLoggedInException from None + ds.delete(large_ok=large_ok) if verbose: logger.info(f"{path} dataset deleted successfully.") @@ -1275,6 +1287,11 @@ def deepcopy( if dataset_exists(cache_chain): if overwrite: + if not dataset._allow_delete(cache_chain): + raise DatasetHandlerError( + "Dataset overwrite failed. The dataset is marked as delete_allowed=false. To allow overwrite, you must first run `allow_delete(True)` on the dataset." 
+ ) + try: cache_chain.clear() except Exception as e: @@ -2062,3 +2079,12 @@ def query(query_string: str, token: Optional[str] = "") -> Dataset: from deeplake.enterprise.libdeeplake_query import universal_query return universal_query(query_string=query_string, token=token) + + @staticmethod + def _allow_delete(storage, commit_id=None) -> bool: + meta = json.loads( + storage[get_dataset_meta_key(commit_id or FIRST_COMMIT_ID)].decode("utf-8") + ) + if "allow_delete" in meta and not meta["allow_delete"]: + return False + return True diff --git a/deeplake/api/tests/test_dataset.py b/deeplake/api/tests/test_dataset.py index 6061ad8c6e..b9f920041d 100644 --- a/deeplake/api/tests/test_dataset.py +++ b/deeplake/api/tests/test_dataset.py @@ -74,3 +74,25 @@ def test_persistence_bug(local_ds_generator): ds = local_ds_generator() np.testing.assert_array_equal(ds[tensor_name].numpy(), np.array([[1], [2]])) + + +def test_allow_delete(local_ds_generator, local_path): + ds = local_ds_generator() + assert ds.allow_delete is True + + ds.allow_delete = False + assert ds.allow_delete is False + + ds2 = deeplake.load(ds.path) + assert ds2.allow_delete is False + + with pytest.raises(DatasetHandlerError): + deeplake.empty(ds.path, overwrite=True) + with pytest.raises(DatasetHandlerError): + deeplake.deepcopy(src=ds.path, dest=local_path, overwrite=True) + with pytest.raises(DatasetHandlerError): + ds.delete() + + ds.allow_delete = True + assert ds.allow_delete is True + ds.delete() diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py index fe68190cd5..67bd78fa30 100644 --- a/deeplake/core/dataset/dataset.py +++ b/deeplake/core/dataset/dataset.py @@ -111,6 +111,7 @@ BadRequestException, SampleAppendError, SampleExtendError, + DatasetHandlerError, ) from deeplake.util.keys import ( dataset_exists, @@ -2040,6 +2041,16 @@ def _set_read_only(self, value: bool, err: bool): def read_only(self, value: bool): self._set_read_only(value, True) + + @property + def allow_delete(self) -> bool: + """Returns True if dataset can be deleted from storage. 
Whether it can be deleted or not is stored in the dataset_meta.json and can be changed by setting the allow_delete property""" return self.meta.allow_delete + + @allow_delete.setter + def allow_delete(self, value: bool): + self.meta.allow_delete = value + self.flush() + def pytorch( self, transform: Optional[Callable] = None, @@ -2587,12 +2598,18 @@ def delete(self, large_ok=False): Raises: DatasetTooLargeToDelete: If the dataset is larger than 1 GB and ``large_ok`` is ``False``. + DatasetHandlerError: If the dataset is marked as allow_delete=False. """ deeplake_reporter.feature_report( feature_name="delete", parameters={"large_ok": large_ok} ) + + if not self.allow_delete: + raise DatasetHandlerError( + "The dataset is marked as allow_delete=False. To delete this dataset, you must first set `ds.allow_delete = True`." + ) + if hasattr(self, "_view_entry"): self._view_entry.delete() return @@ -2652,6 +2669,9 @@ def __str__(self): if self.read_only: mode_str = f"read_only=True, " + if not self.allow_delete: + mode_str += f"allow_delete=False, " + index_str = f"index={self.index}, " if self.index.is_trivial(): index_str = "" diff --git a/deeplake/core/meta/dataset_meta.py b/deeplake/core/meta/dataset_meta.py index 8b6f7905c3..da4663ccda 100644 --- a/deeplake/core/meta/dataset_meta.py +++ b/deeplake/core/meta/dataset_meta.py @@ -16,6 +16,7 @@ def __init__(self): self.tensor_names = {} self.hidden_tensors = [] self.default_index = Index().to_json() + self._allow_delete = True @property def visible_tensors(self): @@ -33,6 +34,15 @@ def nbytes(self): # TODO: can optimize this return len(self.tobytes()) + @property + def allow_delete(self): + return self._allow_delete + + @allow_delete.setter + def allow_delete(self, value): + self._allow_delete = value + self.is_dirty = True + def __getstate__(self) -> Dict[str, Any]: d = super().__getstate__() d["tensors"] = self.tensors.copy() @@ -40,9 +50,12 @@ d["tensor_names"] = 
self.tensor_names.copy() d["hidden_tensors"] = self.hidden_tensors.copy() d["default_index"] = self.default_index.copy() + d["allow_delete"] = self._allow_delete return d def __setstate__(self, d): + if "allow_delete" in d: + d["_allow_delete"] = d.pop("allow_delete") self.__dict__.update(d) def add_tensor(self, name, key, hidden=False):