From ddd77daf6476107e249e3d27f1f4272d42857799 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 30 Jan 2024 10:18:51 -0500 Subject: [PATCH] Introduce ContentType as Blob vs Zarr and rename blobDateModified to contentDateModified This is just an initial attempt open for discussion. I ran into "blobDateModified" in zarr metadata and it raised my eyebrow since that is not really appropriate and is confusing. Hence I decided to look into generalization. I also thought that it would be valuable to make "type" of the content Asset points to explicit, although that could lead to inconsistencies since information is somewhat redundant with encodingFormat and potentially could also be deduced from contentUrl since we have different end points on S3, etc. Nevertheless I think it might be better to make it explicit. Or at least we have to rename blobDateModified. - ContentType name is quite suboptimal since there is a standard HTTP header https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type and thus we could cause potential confusion. But we should keep it a "Type" (not e.g. a Class) to be consistent with other type definitions among models. So the other part we could try to vary is "Content". Possible alternatives are "Object", "Data", "Resource" - ATM we call all Zarrs just Zarr but it is a "ZarrFolder" really. 
I wonder if it would be time to start to introduce differentiation here by making it "ZarrFolder", as later we might get "ZarrHDF5" or alike --- dandischema/consts.py | 3 ++- dandischema/metadata.py | 12 ++++++++++++ dandischema/models.py | 19 +++++++++++++++++-- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/dandischema/consts.py b/dandischema/consts.py index 802cb9f..e6b1c9e 100644 --- a/dandischema/consts.py +++ b/dandischema/consts.py @@ -1,4 +1,4 @@ -DANDI_SCHEMA_VERSION = "0.6.5" +DANDI_SCHEMA_VERSION = "0.7.0" ALLOWED_INPUT_SCHEMAS = [ "0.4.4", "0.5.1", @@ -8,6 +8,7 @@ "0.6.2", "0.6.3", "0.6.4", + "0.6.5", ] # ATM we allow only for a single target version which is current diff --git a/dandischema/metadata.py b/dandischema/metadata.py index 21999ea..9a8df6c 100644 --- a/dandischema/metadata.py +++ b/dandischema/metadata.py @@ -300,6 +300,18 @@ def migrate( if "schemaKey" not in obj: obj["schemaKey"] = "Dandiset" obj["schemaVersion"] = to_version + if version2tuple(schema_version) < version2tuple("0.7.0"): + if "blobDateModified" in obj: + obj["contentDateModified"] = obj.pop("blobDateModified") + # we need to deduce what type of content we have + obj["contentType"] = ( + models.ContentType.Zarr + if ( + obj.get("encodingFormat") == "application/x-zarr" + or obj.get("path", "").endswith(".zarr") + ) + else models.ContentType.Blob + ) return obj diff --git a/dandischema/models.py b/dandischema/models.py index 7633143..ddf0cb5 100644 --- a/dandischema/models.py +++ b/dandischema/models.py @@ -77,6 +77,16 @@ class AccessType(Enum): """ +class ContentType(Enum): + """An enumeration of types of content asset can have""" + + #: Asset contains a regular file - a data blob + Blob = "dandi:Blob" + + #: Asset contains a zarr (currently a folder) + Zarr = "dandi:Zarr" + + class DigestType(Enum): """An enumeration of checksum types""" @@ -1516,10 +1526,15 @@ class BareAsset(CommonModel): json_schema_extra={"nskey": "schema"}, title="Asset (file or 
metadata) modification date and time", ) - blobDateModified: Optional[datetime] = Field( + contentType: ContentType = Field( + None, + json_schema_extra={"nskey": "dandi"}, + title="Type of the content asset contains.", + ) + contentDateModified: Optional[datetime] = Field( None, json_schema_extra={"nskey": "dandi"}, - title="Asset file modification date and time.", + title="Asset content modification date and time.", ) # overload to restrict with max_items=1 access: List[AccessRequirements] = Field(