Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Processor result object #8

Merged
merged 16 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 32 additions & 20 deletions src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,23 @@
'run_processor'
]

from os.path import exists
from os.path import exists, join
from shutil import copyfileobj
import json
import os
from os import getcwd
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Union
import sys
import inspect
import tarfile
import io
from deprecated import deprecated

from ocrd.workspace import Workspace
from ocrd_models.ocrd_file import OcrdFile
from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile
from ocrd.processor.ocrd_page_result import OcrdPageResult
from ocrd_models.ocrd_page_generateds import PcGtsType
from ocrd_utils import (
VERSION as OCRD_VERSION,
MIMETYPE_PAGE,
Expand Down Expand Up @@ -200,10 +201,11 @@ def verify(self):
assert self.output_file_grp is not None
input_file_grps = self.input_file_grp.split(',')
output_file_grps = self.output_file_grp.split(',')
def assert_file_grp_cardinality(grps, spec, msg):
def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
if isinstance(spec, int) and spec > 0:
assert len(grps) == spec, msg % (len(grps), str(spec))
else:
assert isinstance(spec, list)
kba marked this conversation as resolved.
Show resolved Hide resolved
minimum = spec[0]
maximum = spec[1]
if minimum > 0:
Expand Down Expand Up @@ -291,7 +293,7 @@ def process_workspace(self, workspace: Workspace) -> None:
# - ResourceNotFoundError → use ResourceManager to download (once), then retry
# - transient (I/O or OOM) error → maybe sleep, retry
# - persistent (data) error → skip / dummy / raise
input_files = [None] * len(input_file_tuple)
input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple)
for i, input_file in enumerate(input_file_tuple):
if i == 0:
log.info("processing page %s", input_file.pageId)
Expand All @@ -311,7 +313,7 @@ def process_workspace(self, workspace: Workspace) -> None:
# fall back to deprecated method
self.process()

def process_page_file(self, *input_files : OcrdFile) -> None:
def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
"""
Process the given ``input_files`` of the :py:attr:`workspace`,
representing one physical page (passed as one opened
Expand All @@ -323,24 +325,31 @@ def process_page_file(self, *input_files : OcrdFile) -> None:
to handle cases like multiple fileGrps, non-PAGE input etc.)
"""
log = getLogger('ocrd.processor.base')
input_pcgts : List[OcrdPage] = [None] * len(input_files)
input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile))
Comment on lines +329 to +330
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why Optional (also in function prototype)?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here: Because we're instantiating a list of None values, which are not OcrdPage.

In the function signature of process_page_pcgts: Same situation, there might be "holes" in the list of input_pcgts when any of the input_files in process_page_files cannot be parsed as PAGE-XML.

And for process_page_files: The input_files can be hole-y, if the workspace.download_file fails for any of the files (beyond the first?).

But really, I was trying to make sure that static type checking had no more complaints. I tried to add assert statements where I know that variables must be defined or of a certain type to mitigate the "everything might be None" problem somewhat.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, right, I forgot about the holes returned by zip_input_files for multiple fileGrps but incomplete PAGE-XML coverage per page!

Maybe we should document this more loudly.

page_id = input_files[0].pageId
for i, input_file in enumerate(input_files):
assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile))
# FIXME: what about non-PAGE input like image or JSON ???
kba marked this conversation as resolved.
Show resolved Hide resolved
log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
try:
input_pcgts[i] = page_from_file(input_file)
page_ = page_from_file(input_file)
assert isinstance(page_, PcGtsType)
input_pcgts[i] = page_
except ValueError as e:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
except ValueError as e:
except (AssertionError, ValueError) as e:

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this ever happen, ie. can page_from_file(with_etree=False) ever return anything other than a PcGtsType? I think if that was ever the case, we'd want that AssertionError to be raised because then we'd have broken something.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right – it cannot happen. But then what is the assertion good for – satisfying the type checker?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First, my curiosity that I understand the behavior correctly. But secondly, yes, the type checker ;)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But reading this again, I should have used OcrdPage not PcGtsType, which is just an alias but we use OcrdPage in the method typing.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Feel free to change in OCR-D#1240.

log.info("non-PAGE input for page %s: %s", page_id, e)
output_file_id = make_file_id(input_files[0], self.output_file_grp)
result = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id)
for image in result.images:
result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
for image_result in result.images:
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
image_result.alternative_image.set_filename(image_file_path)
kba marked this conversation as resolved.
Show resolved Hide resolved
self.workspace.save_image_file(
image.pil,
image.file_id,
image_result.pil,
image_file_id,
self.output_file_grp,
page_id=page_id,
file_path=image.file_path)
file_path=image_file_path)
result.pcgts.set_pcGtsId(output_file_id)
self.add_metadata(result.pcgts)
# FIXME: what about non-PAGE output like JSON ???
Expand All @@ -351,18 +360,21 @@ def process_page_file(self, *input_files : OcrdFile) -> None:
mimetype=MIMETYPE_PAGE,
content=to_xml(result.pcgts))

def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPageResult:
def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
bertsky marked this conversation as resolved.
Show resolved Hide resolved
"""
Process the given ``input_pcgts`` of the :py:attr:`workspace`,
representing one physical page (passed as one parsed
:py:class:`~ocrd_models.OcrdPage` per input fileGrp)
under the given :py:attr:`parameter`, and return the
resulting :py:class:`~ocrd_models.OcrdPage`.

Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage`
and one or more lists or tuples of :py:class:`PIL.Image` (image data),
:py:class:str (file ID) and :py:class:str (file path) of derived images
to be annotated along with the resulting PAGE file.
resulting :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult`.

Optionally, add to the ``images`` attribute of the resulting
:py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResult` instances
of :py:class:`~ocrd.processor.ocrd_page_result.OcrdPageResultImage`,
kba marked this conversation as resolved.
Show resolved Hide resolved
which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
``file_id_suffix`` (used for generating IDs of saved images) and
``file_path`` (the path used in the AlternativeImage and for saving the
file).
kba marked this conversation as resolved.
Show resolved Hide resolved

(This contains the main functionality and must be overridden by subclasses.)
"""
Expand Down
12 changes: 9 additions & 3 deletions src/ocrd/processor/builtin/dummy_processor.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# pylint: disable=missing-module-docstring,invalid-name
from os.path import join, basename
from typing import Optional
from typing import Optional, Union

import click

from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd.processor.ocrd_page_result import OcrdPageResult
from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile
from ocrd_models.ocrd_page import OcrdPage, to_xml
kba marked this conversation as resolved.
Show resolved Hide resolved
from ocrd_models.ocrd_page_generateds import PcGtsType
from ocrd_utils import (
getLogger,
make_file_id,
Expand All @@ -25,13 +27,16 @@ class DummyProcessor(Processor):
Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
"""

def process_page_pcgts(self, *input_pcgts: OcrdPage, output_file_id: Optional[str] = None, page_id: Optional[str] = None) -> OcrdPageResult:
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
bertsky marked this conversation as resolved.
Show resolved Hide resolved
assert input_pcgts[0]
# nothing to do here
return OcrdPageResult(input_pcgts[0])

def process_page_file(self, *input_files):
def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
LOG = getLogger('ocrd.dummy')
input_file = input_files[0]
assert input_file
assert input_file.local_filename
kba marked this conversation as resolved.
Show resolved Hide resolved
if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE:
# we need to mimic the actual copying in addition to the PAGE boilerplate
file_id = make_file_id(input_file, self.output_file_grp)
Expand All @@ -49,6 +54,7 @@ def process_page_file(self, *input_files):
content=content)
file_id = file_id + '_PAGE'
pcgts = page_from_file(output_file)
assert isinstance(pcgts, PcGtsType)
pcgts = self.process_page_pcgts(pcgts).pcgts
pcgts.set_pcGtsId(file_id)
self.add_metadata(pcgts)
Expand Down
6 changes: 4 additions & 2 deletions src/ocrd/processor/ocrd_page_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from ocrd_models.ocrd_page import OcrdPage
from PIL.Image import Image

from ocrd_models.ocrd_page_generateds import AlternativeImageType

@dataclass
class OcrdPageResultImage():
pil : Image
file_id : str
file_path : str
file_id_suffix : str
alternative_image : AlternativeImageType

@dataclass
class OcrdPageResult():
Expand Down