Skip to content

Commit

Permalink
Merge pull request #12 from bertsky/new-processor-api-input-file-errors
Browse files Browse the repository at this point in the history
error handling (input, processing/output, result/overwrite)
  • Loading branch information
bertsky authored Aug 21, 2024
2 parents 0adb9fb + 8077d45 commit 6b68f7a
Show file tree
Hide file tree
Showing 21 changed files with 393 additions and 187 deletions.
7 changes: 7 additions & 0 deletions docs/api/ocrd/ocrd.processor.ocrd_page_result.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
ocrd.processor.ocrd\_page\_result module
========================================

.. automodule:: ocrd.processor.ocrd_page_result
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/api/ocrd/ocrd.processor.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ Submodules

ocrd.processor.base
ocrd.processor.helpers
ocrd.processor.ocrd_page_result
8 changes: 8 additions & 0 deletions src/ocrd/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@
\b
{config.describe('OCRD_DOWNLOAD_TIMEOUT')}
\b
{config.describe('OCRD_DOWNLOAD_INPUT')}
\b
{config.describe('OCRD_MISSING_INPUT')}
\b
{config.describe('OCRD_MISSING_OUTPUT')}
\b
{config.describe('OCRD_EXISTING_OUTPUT')}
\b
{config.describe('OCRD_METS_CACHING')}
\b
{config.describe('OCRD_MAX_PROCESSOR_CACHE')}
Expand Down
66 changes: 31 additions & 35 deletions src/ocrd/cli/bashlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@
from ocrd.decorators import (
parameter_option,
parameter_override_option,
ocrd_loglevel
ocrd_loglevel,
ocrd_cli_wrap_processor
)
from ocrd_utils import (
is_local_filename,
get_local_filename,
initLogging,
make_file_id
getLogger,
make_file_id,
config
)
from ocrd.resolver import Resolver
from ocrd.processor import Processor
Expand Down Expand Up @@ -81,11 +84,15 @@ def bashlib_constants(name):
@bashlib_cli.command('input-files')
@click.option('-m', '--mets', help="METS to process", default=DEFAULT_METS_BASENAME)
@click.option('-w', '--working-dir', help="Working Directory")
@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT')
@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT')
@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default=None)
@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default=None)
# repeat some other processor options for convenience (will be ignored here)
@click.option('-g', '--page-id', help="ID(s) of the pages to process")
@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist")
@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist\n"
"(with '--page-id', remove only those).\n"
"Short-hand for OCRD_EXISTING_OUTPUT=OVERWRITE")
@click.option('--debug', is_flag=True, default=False, help="Abort on any errors with full stack trace.\n"
"Short-hand for OCRD_MISSING_OUTPUT=ABORT")
@parameter_option
@parameter_override_option
@ocrd_loglevel
Expand All @@ -100,37 +107,26 @@ def bashlib_input_files(**kwargs):
(The printing format is one associative array initializer per line.)
"""
initLogging()
mets = kwargs.pop('mets')
working_dir = kwargs.pop('working_dir')
if is_local_filename(mets) and not isfile(get_local_filename(mets)):
msg = "File does not exist: %s" % mets
raise FileNotFoundError(msg)
resolver = Resolver()
workspace = resolver.workspace_from_url(mets, working_dir)
class BashlibProcessor(Processor):
@property
def ocrd_tool(self):
return {}
return {'executable': '', 'steps': ['']}
@property
def executable(self):
return ''
processor = BashlibProcessor(None)
# go half way of the normal run_processor / process_workspace call tree
processor.workspace = workspace
processor.page_id = kwargs['page_id']
processor.input_file_grp = kwargs['input_file_grp']
processor.output_file_grp = kwargs['output_file_grp']
for input_files in processor.zip_input_files(mimetype=None, on_error='abort'):
# ensure all input files exist locally (without persisting them in the METS)
# - this mimics the default behaviour of all Pythonic processors
input_files = [workspace.download_file(input_file) if input_file else None
for input_file in input_files]
for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']:
# make this bash-friendly (show initialization for associative array)
if len(input_files) > 1:
# single quotes allow us to preserve the list value inside the alist
print("[%s]='%s'" % (field, ' '.join(str(getattr(res, field)) for res in input_files)), end=' ')
else:
print("[%s]='%s'" % (field, str(getattr(input_files[0], field))), end=' ')
print("[outputFileId]='%s'" % make_file_id(input_files[0], kwargs['output_file_grp']))
def version(self):
return '1.0'
# go halfway through the normal run_processor / process_workspace call tree
# by just delegating to process_workspace, overriding process_page_file
# to ensure all input files exist locally (without persisting them in the METS)
# and print what needs to be acted on in a bash-friendly way
def process_page_file(self, *input_files):
for field in ['url', 'local_filename', 'ID', 'mimetype', 'pageId']:
# make this bash-friendly (show initialization for associative array)
if len(input_files) > 1:
# single quotes allow us to preserve the list value inside the alist
value = ' '.join(str(getattr(res, field)) for res in input_files)
else:
value = str(getattr(input_files[0], field))
print(f"[{field}]='{value}'", end=' ')
output_file_id = make_file_id(input_files[0], kwargs['output_file_grp'])
print(f"[outputFileId]='{output_file_id}'")
ocrd_cli_wrap_processor(BashlibProcessor, **kwargs)
4 changes: 2 additions & 2 deletions src/ocrd/cli/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def validate_cli():
@click.argument('ocrd_tool', required=False, nargs=1)
def validate_ocrd_tool(ocrd_tool):
'''
Validate OCRD_TOOL as an ocrd-tool.json file.
Validate OCRD_TOOL as an `ocrd-tool.json` file.
'''
if not ocrd_tool:
ocrd_tool = 'ocrd-tool.json'
Expand Down Expand Up @@ -107,7 +107,7 @@ def validate_page(page, **kwargs):
@click.argument('tasks', nargs=-1, required=True)
def validate_process(tasks, workspace, mets_basename, overwrite, page_id):
'''
Validate a sequence of tasks passable to 'ocrd process'
Validate a sequence of tasks passable to `ocrd process`
'''
if workspace:
_inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks],
Expand Down
1 change: 1 addition & 0 deletions src/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
} | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<local_filename>.*)' \\
-G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
"""
log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
workspace = Workspace(
Expand Down
22 changes: 4 additions & 18 deletions src/ocrd/decorators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor(
profile_file=None,
version=False,
overwrite=False,
debug=False,
resolve_resource=None,
show_resource=None,
list_resources=False,
Expand Down Expand Up @@ -117,25 +118,10 @@ def resolve(name):
resolver.resolve_mets_arguments(working_dir, mets, None, mets_server_url)
workspace = resolver.workspace_from_url(mets, working_dir, mets_server_url=mets_server_url)
page_id = kwargs.get('page_id')
# XXX not possible while processors do not adhere to # https://github.com/OCR-D/core/issues/505
# if overwrite
# if 'output_file_grp' not in kwargs or not kwargs['output_file_grp']:
# raise Exception("--overwrite requires --output-file-grp")
# LOG.info("Removing files because of --overwrite")
# for grp in kwargs['output_file_grp'].split(','):
# if page_id:
# for one_page_id in kwargs['page_id'].split(','):
# LOG.debug("Removing files in output file group %s with page ID %s", grp, one_page_id)
# for file in workspace.mets.find_files(pageId=one_page_id, fileGrp=grp):
# workspace.remove_file(file, force=True, keep_file=False, page_recursive=True)
# else:
# LOG.debug("Removing all files in output file group %s ", grp)
# # TODO: can be reduced to `page_same_group=True` as soon as core#505 has landed (in all processors)
# workspace.remove_file_group(grp, recursive=True, force=True, keep_files=False, page_recursive=True, page_same_group=False)
# workspace.save_mets()
# XXX While https://github.com/OCR-D/core/issues/505 is open, set 'overwrite_mode' globally on the workspace
if overwrite:
workspace.overwrite_mode = True
config.OCRD_EXISTING_OUTPUT = 'OVERWRITE'
if debug:
config.OCRD_MISSING_OUTPUT = 'ABORT'
report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id)
if not report.is_valid:
raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors))
Expand Down
1 change: 1 addition & 0 deletions src/ocrd/decorators/ocrd_cli_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def cli(mets_url):
option('-O', '--output-file-grp', default=None),
option('-g', '--page-id'),
option('--overwrite', is_flag=True, default=False),
option('--debug', is_flag=True, default=False),
option('--profile', is_flag=True, default=False),
option('--profile-file', type=Path(dir_okay=False, writable=True)),
parameter_option,
Expand Down
2 changes: 1 addition & 1 deletion src/ocrd/decorators/parameter_option.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def _handle_param_option(ctx, param, value):
parameter_option = option('-p', '--parameter',
help="Parameters, either JSON string or path to JSON file",
multiple=True,
default=['{}'],
default=[],
# now handled in ocrd_cli_wrap_processor to resolve processor preset files
# callback=_handle_param_option
callback=lambda ctx, param, kv: list(kv))
Expand Down
4 changes: 3 additions & 1 deletion src/ocrd/processor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from .base import (
Processor,
ResourceNotFoundError
ResourceNotFoundError,
NonUniqueInputFile,
MissingInputFile,
)
from .ocrd_page_result import (
OcrdPageResult,
Expand Down
Loading

0 comments on commit 6b68f7a

Please sign in to comment.