Skip to content

Commit

Permalink
Merge pull request #217 from kba/cli-validate-no-download
Browse files Browse the repository at this point in the history
Cli validate no download
  • Loading branch information
kba authored Nov 29, 2018
2 parents 7c45515 + ec60083 commit 391b11e
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 29 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Changed:

* various fixes to workspace and OCRD-ZIP validation, #217

## [0.13.1] - 2018-11-26

Fixed:
Expand Down
6 changes: 4 additions & 2 deletions ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,11 @@ def workspace_cli(ctx, directory, mets_basename, backup):
''')
@pass_workspace
@click.option('-a', '--download', is_flag=True, help="Download all files")
@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(['mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density']))
@click.argument('mets_url')
def validate_workspace(ctx, mets_url=None):
report = WorkspaceValidator.validate_url(ctx.resolver, mets_url, src_dir=ctx.directory)
def validate_workspace(ctx, mets_url, download, skip):
report = WorkspaceValidator.validate_url(ctx.resolver, mets_url, src_dir=ctx.directory, skip=skip, download=download)
print(report.to_xml())
if not report.is_valid:
sys.exit(128)
Expand Down
5 changes: 3 additions & 2 deletions ocrd/model/ocrd_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@ class OcrdFile(object):
# def create(mimetype, ID, url, local_filename):
# el_fileGrp.SubElement('file')

def __init__(self, el, mimetype=None, instance=None, local_filename=None, workspace=None):
def __init__(self, el, mimetype=None, instance=None, local_filename=None, baseurl=''):
if el is None:
el = ET.Element(TAG_METS_FILE)
self._el = el
self.mimetype = mimetype
self.local_filename = local_filename
if baseurl and not local_filename and '://' not in self.url:
self.local_filename = '%s/%s' % (baseurl, self.url)

self._instance = instance
self.workspace = workspace

def __str__(self):
# props = '\n\t'.join([
Expand Down
11 changes: 9 additions & 2 deletions ocrd/model/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,18 @@ def empty_mets():
tpl = tpl.replace('{{ NOW }}', '%s' % datetime.now())
return OcrdMets(content=tpl.encode('utf-8'))

def __init__(self, file_by_id=None, **kwargs):
def __init__(self, file_by_id=None, baseurl='', **kwargs):
"""
Arguments:
file_by_id (dict): Cache mapping file ID to OcrdFile
baseurl (string, ''): Base URL to prepend to relative file URL
"""
super(OcrdMets, self).__init__(**kwargs)
if file_by_id is None:
file_by_id = {}
self._file_by_id = file_by_id
self.baseurl = baseurl

def __str__(self):
return 'OcrdMets[fileGrps=%s,files=%s]' % (self.file_groups, self.find_files())
Expand Down Expand Up @@ -100,7 +107,7 @@ def find_files(self, ID=None, fileGrp=None, groupId=None, mimetype=None, local_o
for el in file_els:
file_id = el.get('ID')
if file_id not in self._file_by_id:
self._file_by_id[file_id] = OcrdFile(el)
self._file_by_id[file_id] = OcrdFile(el, baseurl=self.baseurl)
if local_only:
url = el.find('mets:FLocat', NS).get('{%s}href' % NS['xlink'])
if not url.startswith('file://'):
Expand Down
6 changes: 3 additions & 3 deletions ocrd/validator/ocrd_zip_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,18 @@ def _validate_bag(self, bag, **kwargs):
"""
Validate BagIt (checksums, payload.oxum etc)
"""
failed = False
failed = None
try:
bag.validate(**kwargs)
except BagValidationError as e:
failed = True
failed = e
for d in e.details:
if isinstance(d, ChecksumMismatch):
log.error("Validation Error: expected %s to have %s checksum of %s but found %s", d.path, d.algorithm, d.expected, d.found)
else:
log.error("Validation Error: %s", d)
if failed:
raise BagValidationError("bagit validation failed")
raise BagValidationError("%s" % failed)

def _validate_workspace(self, bag):
"""
Expand Down
10 changes: 8 additions & 2 deletions ocrd/validator/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,18 @@ def __init__(self):
self.entries = []
self.warnings = []
self.errors = []
self.notices = []

def __str__(self):
ret = 'OK' if self.is_valid else 'INVALID'
if not self.is_valid:
if not self.is_valid or self.notices:
ret += '['
if self.warnings:
ret += ' %s warnings' % len(self.warnings)
if self.errors:
ret += ' %s errors' % len(self.errors)
if self.notices:
ret += ' %s notices' % len(self.notices)
ret += ' ]'
return ret

Expand All @@ -31,7 +34,7 @@ def is_valid(self):

def to_xml(self):
    """
    Serialize the report as a small XML document.

    One ``<warning>``, ``<error>`` or ``<notice>`` element is emitted per
    recorded message, inside a ``<report>`` root whose ``valid`` attribute
    is "true" or "false" depending on ``self.is_valid``.

    Returns:
        string: XML serialization of the report
    """
    # Local import: keeps this method self-contained regardless of the
    # module's import list.
    from xml.sax.saxutils import escape
    body = ''
    for k in ['warning', 'error', 'notice']:
        for msg in self.__dict__[k + 's']:
            # Messages can contain markup characters (notices embed URLs as
            # "<...>"), so escape them to keep the output well-formed XML.
            body += '\n <%s>%s</%s>' % (k, escape('%s' % msg), k)
    return '<report valid="%s">%s\n</report>' % ("true" if self.is_valid else "false", body)
Expand All @@ -41,3 +44,6 @@ def add_warning(self, msg):

def add_error(self, msg):
self.errors.append(msg)

def add_notice(self, msg):
self.notices.append(msg)
49 changes: 33 additions & 16 deletions ocrd/validator/workspace_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,51 +13,68 @@
class WorkspaceValidator(object):
"""
Validates an OCR-D/METS workspace against the specs.
Args:
resolver (:class:`Resolver`) : Instance of a resolver
mets_url (string) : URL of the METS file
"""

def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False):
    """
    Store the validation parameters; the workspace itself is resolved lazily.

    Arguments:
        resolver (:class:`Resolver`): Instance of a resolver
        mets_url (string): URL of the METS file. If None and `src_dir` is
            given, defaults to '<src_dir>/mets.xml'
        src_dir (string, None): Directory containing mets file
        skip (list): Names of tests to skip
        download (boolean): Whether to download files
    """
    self.report = ValidationReport()
    self.skip = skip if skip else []
    log.debug('resolver=%s mets_url=%s src_dir=%s', resolver, mets_url, src_dir)
    # Apply the src_dir fallback *before* storing mets_url on the instance:
    # the workspace is later resolved from self.mets_url, so rebinding only
    # the local name afterwards would have no effect.
    if mets_url is None and src_dir is not None:
        mets_url = '%s/mets.xml' % src_dir
    self.resolver = resolver
    self.mets_url = mets_url
    self.download = download
    self.src_dir = src_dir
    # Filled in on first use so that resolution errors can be reported
    # instead of being raised from the constructor.
    self.workspace = None
    self.mets = None

@staticmethod
def validate_url(resolver, mets_url, src_dir=None, skip=None, download=False):
    """
    Validates the workspace of a METS URL against the specs

    Convenience wrapper: builds a :class:`WorkspaceValidator` from the
    arguments and returns the result of its ``validate()`` call.

    Arguments:
        resolver (:class:`ocrd.Resolver`): Resolver
        mets_url (string): URL of the METS file
        src_dir (string, None): Directory containing mets file
        skip (list): Tests to skip. One or more of 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density'
        download (boolean): Whether to download files

    Returns:
        report (:class:`ValidationReport`): Report on the validity
    """
    # Explicit parameters (same names, order and defaults as the validator's
    # constructor) keep the accepted arguments visible in the signature.
    validator = WorkspaceValidator(resolver, mets_url, src_dir=src_dir, skip=skip, download=download)
    return validator.validate()

def validate(self):
    """
    Resolve the workspace and run every check not listed in ``self.skip``.

    Failures are recorded in the report rather than raised: if the workspace
    cannot be resolved, a single error is added and no checks run; if an
    individual check raises, an error naming that check is added and the
    remaining checks still run.

    Returns:
        report (:class:`ValidationReport`): Report on the validity
    """
    try:
        self._resolve_workspace()
    except Exception as e: # pylint: disable=broad-except
        self.report.add_error("Failed to instantiate workspace: %s" % e)
        return self.report
    for name in ['mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density']:
        if name in self.skip:
            continue
        try:
            getattr(self, '_validate_' + name)()
        except Exception as e: # pylint: disable=broad-except
            # Attribute the failure to the check that raised instead of
            # blaming workspace instantiation.
            self.report.add_error("Failed to validate %s: %s" % (name, e))
    return self.report

def _resolve_workspace(self):
if self.workspace is None:
self.workspace = self.resolver.workspace_from_url(self.mets_url, src_dir=self.src_dir, download=self.download)
self.mets = self.workspace.mets

def _validate_mets_unique_identifier(self):
if self.mets.unique_identifier is None:
self.report.add_error("METS has no unique identifier")

def _validate_pixel_density(self):
for f in [f for f in self.mets.find_files() if f.mimetype.startswith('image/')]:
if not f.local_filename and not self.download:
self.report.add_notice("Won't download remote image <%s>" % f.url)
continue
exif = self.workspace.resolve_image_exif(f.url)
for k in ['xResolution', 'yResolution']:
v = exif.__dict__.get(k)
Expand All @@ -67,7 +84,7 @@ def _validate_pixel_density(self):
def _validate_mets_file_group_names(self):
for fileGrp in self.mets.file_groups:
if not fileGrp.startswith(FILE_GROUP_PREFIX):
self.report.add_warning("fileGrp USE does not begin with '%s': %s" % (FILE_GROUP_PREFIX, fileGrp))
self.report.add_notice("fileGrp USE does not begin with '%s': %s" % (FILE_GROUP_PREFIX, fileGrp))
else:
# OCR-D-FOO-BAR -> ('FOO', 'BAR')
# \____/\_/ \_/
Expand Down
3 changes: 1 addition & 2 deletions ocrd/workspace.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
from os.path import dirname
import sys
import shutil

Expand Down Expand Up @@ -32,7 +31,7 @@ def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', aut
self.directory = directory
self.mets_target = os.path.join(directory, mets_basename)
if mets is None:
mets = OcrdMets(filename=self.mets_target)
mets = OcrdMets(filename=self.mets_target, baseurl=directory)
self.mets = mets
self.automatic_backup = automatic_backup
self.src_dir = src_dir
Expand Down

0 comments on commit 391b11e

Please sign in to comment.