diff --git a/Makefile b/Makefile index 0608b0b73..39b46ee84 100644 --- a/Makefile +++ b/Makefile @@ -238,9 +238,9 @@ repo/assets repo/spec: always-update .PHONY: spec # Copy JSON Schema, OpenAPI from OCR-D/spec -spec: repo/spec - cp repo/spec/ocrd_tool.schema.yml ocrd_validators/ocrd_validators/ocrd_tool.schema.yml - cp repo/spec/bagit-profile.yml ocrd_validators/ocrd_validators/bagit-profile.yml +spec: # repo/spec + cp repo/spec/ocrd_tool.schema.yml src/ocrd_validators/ocrd_tool.schema.yml + cp repo/spec/bagit-profile.yml src/ocrd_validators/bagit-profile.yml # # Assets diff --git a/repo/spec b/repo/spec index 2bbd4dd91..cb1ba2e72 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 2bbd4dd916519f567e5c648b24c0b5ca6fc8a183 +Subproject commit cb1ba2e72bd176f1a1076eea38d6438c647e68e7 diff --git a/src/ocrd/processor/__init__.py b/src/ocrd/processor/__init__.py index 21b0c69eb..0b3ce5a56 100644 --- a/src/ocrd/processor/__init__.py +++ b/src/ocrd/processor/__init__.py @@ -2,6 +2,10 @@ Processor, ResourceNotFoundError ) +from .ocrd_page_result import ( + OcrdPageResult, + OcrdPageResultImage +) from .helpers import ( run_cli, run_processor, diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 43aec4ace..46417ac2c 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -9,13 +9,13 @@ 'run_processor' ] -from os.path import exists +from os.path import exists, join from shutil import copyfileobj import json import os from os import getcwd from pathlib import Path -from typing import Optional +from typing import List, Optional, Union import sys import inspect import tarfile @@ -23,6 +23,9 @@ from deprecated import deprecated from ocrd.workspace import Workspace +from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile +from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( VERSION as OCRD_VERSION, MIMETYPE_PAGE, @@ -198,10 +201,12 @@ def verify(self): assert self.output_file_grp is not None input_file_grps = self.input_file_grp.split(',') output_file_grps = self.output_file_grp.split(',') - def assert_file_grp_cardinality(grps, spec, msg): - if isinstance(spec, int) and spec > 0: - assert len(grps) == spec, msg % (len(grps), str(spec)) + def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg): + if isinstance(spec, int): + if spec > 0: + assert len(grps) == spec, msg % (len(grps), str(spec)) else: + assert isinstance(spec, list) minimum = spec[0] maximum = spec[1] if minimum > 0: @@ -289,7 +294,7 @@ def process_workspace(self, workspace: Workspace) -> None: # - ResourceNotFoundError → use ResourceManager to download (once), then retry # - transient (I/O or OOM) error → maybe sleep, retry # - persistent (data) error → skip / dummy / raise - input_files = [None] * len(input_file_tuple) + input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple) for i, input_file in enumerate(input_file_tuple): if i == 0: log.info("processing page %s", input_file.pageId) @@ -309,7 +314,7 @@ def process_workspace(self, workspace: Workspace) -> None: # fall back to deprecated method self.process() - def process_page_file(self, *input_files) -> None: + def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: """ Process the given ``input_files`` of the :py:attr:`workspace`, representing one physical page (passed as one opened @@ -321,49 +326,55 @@ def process_page_file(self, *input_files) -> None: to handle cases like multiple fileGrps, non-PAGE input etc.) """ log = getLogger('ocrd.processor.base') - input_pcgts = [None] * len(input_files) + input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files) + assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile)) page_id = input_files[0].pageId for i, input_file in enumerate(input_files): - # FIXME: what about non-PAGE input like image or JSON ??? + assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile)) log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId) try: - input_pcgts[i] = page_from_file(input_file) + page_ = page_from_file(input_file) + assert isinstance(page_, PcGtsType) + input_pcgts[i] = page_ except ValueError as e: log.info("non-PAGE input for page %s: %s", page_id, e) output_file_id = make_file_id(input_files[0], self.output_file_grp) - output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id) - if isinstance(output_pcgts, (list, tuple)): - output_images = output_pcgts[1:] - output_pcgts = output_pcgts[0] - for output_image_pil, output_image_id, output_image_path in output_images: - self.workspace.save_image_file( - output_image_pil, - output_image_id, - self.output_file_grp, - page_id=page_id, - file_path=output_image_path) - output_pcgts.set_pcGtsId(output_file_id) - self.add_metadata(output_pcgts) + result = self.process_page_pcgts(*input_pcgts, page_id=page_id) + for image_result in result.images: + image_file_id = f'{output_file_id}_{image_result.file_id_suffix}' + image_file_path = join(self.output_file_grp, f'{image_file_id}.png') + image_result.alternative_image.set_filename(image_file_path) + self.workspace.save_image_file( + image_result.pil, + image_file_id, + self.output_file_grp, + page_id=page_id, + file_path=image_file_path) + result.pcgts.set_pcGtsId(output_file_id) + self.add_metadata(result.pcgts) # FIXME: what about non-PAGE output like JSON ??? self.workspace.add_file(file_id=output_file_id, file_grp=self.output_file_grp, page_id=page_id, local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'), mimetype=MIMETYPE_PAGE, - content=to_xml(output_pcgts)) + content=to_xml(result.pcgts)) - def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """ Process the given ``input_pcgts`` of the :py:attr:`workspace`, representing one physical page (passed as one parsed :py:class:`~ocrd_models.OcrdPage` per input fileGrp) under the given :py:attr:`parameter`, and return the - resulting :py:class:`~ocrd_models.OcrdPage`. + resulting :py:class:`~ocrd.processor.OcrdPageResult`. - Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage` - and one or more lists or tuples of :py:class:`PIL.Image` (image data), - :py:class:str (file ID) and :py:class:str (file path) of derived images - to be annotated along with the resulting PAGE file. + Optionally, add to the ``images`` attribute of the resulting + :py:class:`~ocrd.processor.OcrdPageResult` instances + of :py:class:`~ocrd.processor.OcrdPageResultImage`, + which have required fields for ``pil`` (:py:class:`PIL.Image` image data), + ``file_id_suffix`` (used for generating IDs of the saved image) and + ``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType` + for setting the filename of the saved image). (This contains the main functionality and must be overridden by subclasses.) """ @@ -374,7 +385,9 @@ def add_metadata(self, pcgts: OcrdPage) -> None: Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``. """ - pcgts.get_Metadata().add_MetadataItem( + metadata_obj = pcgts.get_Metadata() + assert metadata_obj is not None + metadata_obj.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=self.ocrd_tool['executable'], diff --git a/src/ocrd/processor/builtin/dummy_processor.py b/src/ocrd/processor/builtin/dummy_processor.py index b05ca9e6d..5ef76d2fa 100644 --- a/src/ocrd/processor/builtin/dummy_processor.py +++ b/src/ocrd/processor/builtin/dummy_processor.py @@ -1,14 +1,17 @@ # pylint: disable=missing-module-docstring,invalid-name from os.path import join, basename +from typing import Optional, Union import click from ocrd import Processor from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_models.ocrd_page import to_xml +from ocrd.processor.ocrd_page_result import OcrdPageResult +from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile +from ocrd_models.ocrd_page import OcrdPage, to_xml +from ocrd_models.ocrd_page_generateds import PcGtsType from ocrd_utils import ( getLogger, - assert_file_grp_cardinality, make_file_id, MIME_TO_EXT, MIMETYPE_PAGE, @@ -24,13 +27,16 @@ class DummyProcessor(Processor): Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group """ - def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + assert input_pcgts[0] # nothing to do here - return input_pcgts[0] + return OcrdPageResult(input_pcgts[0]) - def process_page_file(self, *input_files): + def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None: LOG = getLogger('ocrd.dummy') input_file = input_files[0] + assert input_file + assert input_file.local_filename if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE: # we need to mimic the actual copying in addition to the PAGE boilerplate file_id = make_file_id(input_file, self.output_file_grp) @@ -48,7 +54,8 @@ def process_page_file(self, *input_files): content=content) file_id = file_id + '_PAGE' pcgts = page_from_file(output_file) - pcgts = self.process_page_pcgts(pcgts) + assert isinstance(pcgts, PcGtsType) + pcgts = self.process_page_pcgts(pcgts).pcgts pcgts.set_pcGtsId(file_id) self.add_metadata(pcgts) LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file) diff --git a/src/ocrd/processor/ocrd_page_result.py b/src/ocrd/processor/ocrd_page_result.py new file mode 100644 index 000000000..c63330c73 --- /dev/null +++ b/src/ocrd/processor/ocrd_page_result.py @@ -0,0 +1,17 @@ +from dataclasses import dataclass, field +from typing import List +from ocrd_models.ocrd_page import OcrdPage +from PIL.Image import Image + +from ocrd_models.ocrd_page_generateds import AlternativeImageType + +@dataclass +class OcrdPageResultImage(): + pil : Image + file_id_suffix : str + alternative_image : AlternativeImageType + +@dataclass +class OcrdPageResult(): + pcgts : OcrdPage + images : List[OcrdPageResultImage] = field(default_factory=list) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index fc619b7d0..eeaa6434f 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1073,7 +1073,7 @@ def image_from_segment(self, segment, parent_image, parent_coords, return segment_image, segment_coords # pylint: disable=redefined-builtin - def save_image_file(self, image : Image, + def save_image_file(self, image : Image.Image, file_id : str, file_grp : str, file_path : Optional[str] = None, diff --git a/src/ocrd_modelfactory/__init__.py b/src/ocrd_modelfactory/__init__.py index 7afc5b176..a98499b2e 100644 --- a/src/ocrd_modelfactory/__init__.py +++ b/src/ocrd_modelfactory/__init__.py @@ -79,7 +79,7 @@ def page_from_image(input_file, with_tree=False): revmap = dict(((node, element) for element, node in mapping.items())) return pcgts, etree, mapping, revmap -def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]: +def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]: """ Create :py:class:`~ocrd_models.ocrd_page.OcrdPage` from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec..330fefe97 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -5,5 +5,6 @@ from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile, ClientSideOcrdFile from .ocrd_mets import OcrdMets +from .ocrd_page import OcrdPage from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport diff --git a/src/ocrd_validators/ocrd_tool.schema.yml b/src/ocrd_validators/ocrd_tool.schema.yml index db1b61458..5de65a04e 100644 --- a/src/ocrd_validators/ocrd_tool.schema.yml +++ b/src/ocrd_validators/ocrd_tool.schema.yml @@ -11,7 +11,7 @@ properties: type: string pattern: '^[0-9]+\.[0-9]+\.[0-9]+$' git_url: - description: Github/Gitlab URL + description: GitHub/GitLab URL type: string format: url dockerhub: @@ -37,14 +37,14 @@ properties: type: string input_file_grp: deprecated: true - description: Input fileGrp@USE this tool expects by default + description: (DEPRECATED) Input fileGrp@USE this tool expects by default type: array items: type: string # pattern: '^OCR-D-[A-Z0-9-]+$' output_file_grp: deprecated: true - description: Output fileGrp@USE this tool produces by default + description: (DEPRECATED) Output fileGrp@USE this tool produces by default type: array items: type: string @@ -52,31 +52,26 @@ properties: input_file_grp_cardinality: description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 - additionalProperties: false output_file_grp_cardinality: description: Number of (comma-separated) output fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited) oneOf: - - items: + - type: number + multipleOf: 1 + - type: array + items: type: number multipleOf: 1 - - items: - type: array - items: - type: number - multipleOf: 1 - minItems: 2 - maxItems: 2 + minItems: 2 + maxItems: 2 default: 1 parameters: description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas. @@ -152,9 +147,9 @@ properties: description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change." default: false description: - description: Concise description what the tool does + description: Concise description of what the tool does categories: - description: Tools belong to this categories, representing modules within the OCR-D project structure + description: Tools belong to these categories, representing modules within the OCR-D project structure type: array items: type: string @@ -229,7 +224,7 @@ properties: default: 'as-is' path_in_archive: type: string - description: if type is archive, the resource is at this location in the archive + description: If type is archive, the resource is at this location in the archive default: '.' version_range: type: string @@ -237,4 +232,4 @@ properties: default: '>= 0.0.1' size: type: number - description: Size of the resource in bytes + description: "Size of the resource in bytes to be retrieved (for archives: size of the archive)"