Skip to content

Commit

Permalink
Merge pull request #136 from usnistgov/fix/ODD881-dlurl
Browse files Browse the repository at this point in the history
ODD-881: don't let file examination override download URL
  • Loading branch information
RayPlante authored May 17, 2020
2 parents 42900e5 + adef226 commit a57bc53
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 8 deletions.
38 changes: 30 additions & 8 deletions python/nistoar/pdr/preserv/bagit/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,10 @@ def _upd_downloadurl(self, ediid):

def _download_url(self, ediid, destpath):
path = "/".join(destpath.split(os.sep))
arkpfx= "ark:/{0}/".format(ARK_NAAN)
if ediid.startswith(arkpfx):
# our convention is to omit the "ark:/88434/" prefix
ediid = ediid[len(arkpfx):]
return self._distbase + ediid + '/' + urlencode(path)

def assign_id(self, id, keep_conv=False):
Expand Down Expand Up @@ -1092,7 +1096,6 @@ def _update_file_metadata(self, destpath, mdata, comptype, msg=None):
msg = "Creating new %s: %s" % (comptype, destpath)

mdata = self._update_md(orig, mdata)
out = self.bag.nerd_file_for(destpath)
self._replace_file_metadata(destpath, mdata, msg)
return mdata

Expand Down Expand Up @@ -1258,7 +1261,7 @@ def add_data_file(self, destpath, srcpath, register=True, hardlink=False,
add a data file into the bag at the given destination path. Metadata
will be created for the file unless the register parameter is False.
If a file already exists or is otherwise already registered for that
destination, the file and associated will be over-written.
destination, the file and associated metadata will be over-written.
Metadata is created for the file using the register_data_file() method,
and by default the file will be examined for extractable metadata.
Expand Down Expand Up @@ -1337,9 +1340,10 @@ def add_data_file(self, destpath, srcpath, register=True, hardlink=False,
def register_data_file(self, destpath, srcpath=None, examine=True,
comptype=None, message=None):
"""
create and install metadata into the bag for the given file to be
create and install metadata into the bag for the given file to be (newly)
added at the given destination path. The file itself is not actually
inserted into the bag (see add_data_file()).
inserted into the bag (see add_data_file()). This will completely
overwrite any metadata for this file already registered.
:param str destpath: the desired path for the file relative to the
root of the dataset.
Expand All @@ -1363,7 +1367,8 @@ def register_data_file(self, destpath, srcpath=None, examine=True,
comptype = self._determine_file_comp_type(srcpath or destpath)

if srcpath:
mdata = self.describe_data_file(srcpath, destpath, examine, comptype)
mdata = self.describe_data_file(srcpath, destpath, examine,
comptype, False)
else:
mdata = self.define_component(destpath, comptype)
self._add_mediatype(destpath, mdata)
Expand All @@ -1374,7 +1379,7 @@ def register_data_file(self, destpath, srcpath=None, examine=True,
return self.replace_metadata_for(destpath, mdata, message)

def describe_data_file(self, srcpath, destpath=None, examine=True,
comptype=None):
comptype=None, asupdate=True):
"""
examine the given file and return a metadata description of it.
Expand All @@ -1394,6 +1399,13 @@ def describe_data_file(self, srcpath, destpath=None, examine=True,
component. If not specified, the type will be
discerned by examining the file (defaulting
to "DataFile").
:param bool asupdate: if True (default), the metadata generated will
by considered an update to the previously saved
metadata (if it exists) capturing changes due to
changes in the datafile itself; if False, the metadata
returned will not take into account previous metadata
as if assuming the file is being examined for the first
time.
"""
if not destpath:
destpath = os.path.basename(srcpath)
Expand All @@ -1402,7 +1414,11 @@ def describe_data_file(self, srcpath, destpath=None, examine=True,
if not comptype:
comptype = self._determine_file_comp_type(srcpath)

mdata = self._create_init_md_for(destpath, comptype)
if asupdate and self.bag and os.path.exists(self.bag.nerd_file_for(destpath)):
# TODO: what if comptype has changed?
mdata = self.bag.nerd_metadata_for(destpath, True)
else:
mdata = self._create_init_md_for(destpath, comptype)

try:
self._add_file_specs(srcpath, mdata)
Expand Down Expand Up @@ -1467,12 +1483,18 @@ def _add_checksum(self, hash, mdata, algorithm='sha256', config=None):
'hash': hash
}
def _add_mediatype(self, dfile, mdata, config=None):
defmt = 'application/octet-stream'
if 'mediaType' in mdata and mdata['mediaType'] != defmt:
# we will not override the mediaType if it's already set to something
# specific
return

if not self._mimetypes:
mtfile = pkg_resources.resource_filename('nistoar.pdr',
'data/mime.types')
self._mimetypes = build_mime_type_map([mtfile])
mdata['mediaType'] = self._mimetypes.get(os.path.splitext(dfile)[1][1:],
'application/octet-stream')
defmt)

def _add_extracted_metadata(self, datafile, mdata, config=None):
# deeper extraction not yet supported.
Expand Down
34 changes: 34 additions & 0 deletions python/tests/nistoar/pdr/preserv/bagit/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,6 +1016,7 @@ def test_describe_data_file(self):
self.assertEqual(md['checksum'], {"algorithm": {'@type': 'Thing',
"tag": "sha256" },
"hash": "d155d99281ace123351a311084cd8e34edda6a9afcddd76eb039bad479595ec9"})
self.assertNotIn('downloadURL', md) # because bag does not exist

md = self.bag.describe_data_file(srcfile, "goob/trial1.json", False)
self.assertEqual(md['filepath'], "goob/trial1.json")
Expand Down Expand Up @@ -1045,6 +1046,39 @@ def test_describe_data_file(self):
"tag": "sha256" },
"hash": "d155d99281ace123351a311084cd8e34edda6a9afcddd76eb039bad479595ec9"})

# if bag exists, downloadURL will be set
self.bag.ensure_bagdir()
self.bag.ediid = "goober"
md = self.bag.describe_data_file(srcfile, "foo/trial1.json")
self.assertEqual(md['filepath'], "foo/trial1.json")
self.assertIn('downloadURL', md)
self.assertTrue(md['downloadURL'].endswith("/od/ds/goober/foo/trial1.json"))

self.bag.ediid = "ark:/88434/goober"
md = self.bag.describe_data_file(srcfile)
self.assertEqual(md['filepath'], "trial1.json")
self.assertIn('downloadURL', md)
self.assertTrue(md['downloadURL'].endswith("/od/ds/goober/trial1.json"))

# don't override existing metadata
exturl = "https://example.com/goober/trial1.json"
md['downloadURL'] = exturl
md['title'] = "a fine file"
self.bag.update_metadata_for("trial1.json", md)
md = self.bag.describe_data_file(srcfile)
self.assertEqual(md['filepath'], "trial1.json")
self.assertIn('downloadURL', md)
self.assertEqual(md['downloadURL'], exturl)
self.assertEqual(md['title'], "a fine file")

# override existing metadata when told to
self.bag.update_metadata_for("trial1.json", md)
md = self.bag.describe_data_file(srcfile, "foo/trial1.json", asupdate=False)
self.assertEqual(md['filepath'], "foo/trial1.json")
self.assertIn('downloadURL', md)
self.assertNotIn('title', md)
self.assertTrue(md['downloadURL'].endswith("/od/ds/goober/foo/trial1.json"))

def test_register_data_file(self):
srcfile = os.path.join(datadir, "trial1.json")
mddir = os.path.join(self.bag.bagdir, "metadata")
Expand Down

0 comments on commit a57bc53

Please sign in to comment.