From adef226e49c193492e059f66816cb6981766bb80 Mon Sep 17 00:00:00 2001 From: Ray Plante Date: Sun, 17 May 2020 14:29:57 -0400 Subject: [PATCH] fix ODD-881: bagit.builder.describe_data_file(): by default use previously save metadata as default; added asupdate parameter. --- python/nistoar/pdr/preserv/bagit/builder.py | 38 +++++++++++++++---- .../nistoar/pdr/preserv/bagit/test_builder.py | 34 +++++++++++++++++ 2 files changed, 64 insertions(+), 8 deletions(-) diff --git a/python/nistoar/pdr/preserv/bagit/builder.py b/python/nistoar/pdr/preserv/bagit/builder.py index f74c31b1b..f0a2f13d2 100644 --- a/python/nistoar/pdr/preserv/bagit/builder.py +++ b/python/nistoar/pdr/preserv/bagit/builder.py @@ -431,6 +431,10 @@ def _upd_downloadurl(self, ediid): def _download_url(self, ediid, destpath): path = "/".join(destpath.split(os.sep)) + arkpfx= "ark:/{0}/".format(ARK_NAAN) + if ediid.startswith(arkpfx): + # our convention is to omit the "ark:/88434/" prefix + ediid = ediid[len(arkpfx):] return self._distbase + ediid + '/' + urlencode(path) def assign_id(self, id, keep_conv=False): @@ -1092,7 +1096,6 @@ def _update_file_metadata(self, destpath, mdata, comptype, msg=None): msg = "Creating new %s: %s" % (comptype, destpath) mdata = self._update_md(orig, mdata) - out = self.bag.nerd_file_for(destpath) self._replace_file_metadata(destpath, mdata, msg) return mdata @@ -1258,7 +1261,7 @@ def add_data_file(self, destpath, srcpath, register=True, hardlink=False, add a data file into the bag at the given destination path. Metadata will be created for the file unless the register parameter is False. If a file already exists or is otherwise already registered for that - destination, the file and associated will be over-written. + destination, the file and associated metadata will be over-written. Metadata is created for the file using the register_data_file() method, and by default the file will be examined for extractable metadata. @@ -1337,9 +1340,10 @@ def add_data_file(self, destpath, srcpath, register=True, hardlink=False, def register_data_file(self, destpath, srcpath=None, examine=True, comptype=None, message=None): """ - create and install metadata into the bag for the given file to be + create and install metadata into the bag for the given file to be (newly) added at the given destination path. The file itself is not actually - inserted into the bag (see add_data_file()). + inserted into the bag (see add_data_file()). This will completely + overwrite any metadata for this file already registered. :param str destpath: the desired path for the file relative to the root of the dataset. @@ -1363,7 +1367,8 @@ def register_data_file(self, destpath, srcpath=None, examine=True, comptype = self._determine_file_comp_type(srcpath or destpath) if srcpath: - mdata = self.describe_data_file(srcpath, destpath, examine, comptype) + mdata = self.describe_data_file(srcpath, destpath, examine, + comptype, False) else: mdata = self.define_component(destpath, comptype) self._add_mediatype(destpath, mdata) @@ -1374,7 +1379,7 @@ def register_data_file(self, destpath, srcpath=None, examine=True, return self.replace_metadata_for(destpath, mdata, message) def describe_data_file(self, srcpath, destpath=None, examine=True, - comptype=None): + comptype=None, asupdate=True): """ examine the given file and return a metadata description of it. @@ -1394,6 +1399,13 @@ def describe_data_file(self, srcpath, destpath=None, examine=True, component. If not specified, the type will be discerned by examining the file (defaulting to "DataFile"). + :param bool asupdate: if True (default), the metadata generated will + by considered an update to the previously saved + metadata (if it exists) capturing changes due to + changes in the datafile itself; if False, the metadata + returned will not take into account previous metadata + as if assuming the file is being examined for the first + time. """ if not destpath: destpath = os.path.basename(srcpath) @@ -1402,7 +1414,11 @@ def describe_data_file(self, srcpath, destpath=None, examine=True, if not comptype: comptype = self._determine_file_comp_type(srcpath) - mdata = self._create_init_md_for(destpath, comptype) + if asupdate and self.bag and os.path.exists(self.bag.nerd_file_for(destpath)): + # TODO: what if comptype has changed? + mdata = self.bag.nerd_metadata_for(destpath, True) + else: + mdata = self._create_init_md_for(destpath, comptype) try: self._add_file_specs(srcpath, mdata) @@ -1467,12 +1483,18 @@ def _add_checksum(self, hash, mdata, algorithm='sha256', config=None): 'hash': hash } def _add_mediatype(self, dfile, mdata, config=None): + defmt = 'application/octet-stream' + if 'mediaType' in mdata and mdata['mediaType'] != defmt: + # we will not override the mediaType if it's already set to something + # specific + return + if not self._mimetypes: mtfile = pkg_resources.resource_filename('nistoar.pdr', 'data/mime.types') self._mimetypes = build_mime_type_map([mtfile]) mdata['mediaType'] = self._mimetypes.get(os.path.splitext(dfile)[1][1:], - 'application/octet-stream') + defmt) def _add_extracted_metadata(self, datafile, mdata, config=None): # deeper extraction not yet supported. diff --git a/python/tests/nistoar/pdr/preserv/bagit/test_builder.py b/python/tests/nistoar/pdr/preserv/bagit/test_builder.py index 1f624a9ac..c0b25de98 100644 --- a/python/tests/nistoar/pdr/preserv/bagit/test_builder.py +++ b/python/tests/nistoar/pdr/preserv/bagit/test_builder.py @@ -1016,6 +1016,7 @@ def test_describe_data_file(self): self.assertEqual(md['checksum'], {"algorithm": {'@type': 'Thing', "tag": "sha256" }, "hash": "d155d99281ace123351a311084cd8e34edda6a9afcddd76eb039bad479595ec9"}) + self.assertNotIn('downloadURL', md) # because bag does not exist md = self.bag.describe_data_file(srcfile, "goob/trial1.json", False) self.assertEqual(md['filepath'], "goob/trial1.json") @@ -1045,6 +1046,39 @@ def test_describe_data_file(self): "tag": "sha256" }, "hash": "d155d99281ace123351a311084cd8e34edda6a9afcddd76eb039bad479595ec9"}) + # if bag exists, downloadURL will be set + self.bag.ensure_bagdir() + self.bag.ediid = "goober" + md = self.bag.describe_data_file(srcfile, "foo/trial1.json") + self.assertEqual(md['filepath'], "foo/trial1.json") + self.assertIn('downloadURL', md) + self.assertTrue(md['downloadURL'].endswith("/od/ds/goober/foo/trial1.json")) + + self.bag.ediid = "ark:/88434/goober" + md = self.bag.describe_data_file(srcfile) + self.assertEqual(md['filepath'], "trial1.json") + self.assertIn('downloadURL', md) + self.assertTrue(md['downloadURL'].endswith("/od/ds/goober/trial1.json")) + + # don't override existing metadata + exturl = "https://example.com/goober/trial1.json" + md['downloadURL'] = exturl + md['title'] = "a fine file" + self.bag.update_metadata_for("trial1.json", md) + md = self.bag.describe_data_file(srcfile) + self.assertEqual(md['filepath'], "trial1.json") + self.assertIn('downloadURL', md) + self.assertEqual(md['downloadURL'], exturl) + self.assertEqual(md['title'], "a fine file") + + # override existing metadata when told to + self.bag.update_metadata_for("trial1.json", md) + md = self.bag.describe_data_file(srcfile, "foo/trial1.json", asupdate=False) + self.assertEqual(md['filepath'], "foo/trial1.json") + self.assertIn('downloadURL', md) + self.assertNotIn('title', md) + self.assertTrue(md['downloadURL'].endswith("/od/ds/goober/foo/trial1.json")) + def test_register_data_file(self): srcfile = os.path.join(datadir, "trial1.json") mddir = os.path.join(self.bag.bagdir, "metadata")