diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..f95a09a4 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,33 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test V="" diff --git a/Makefile b/Makefile index a040cf9d..d1991df0 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,17 @@ docker-push: docker-build TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): - bash $@ $V + OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + +test: export OCRD_OVERRIDE_LOGLEVEL=DEBUG +test: export OCRD_MISSING_OUTPUT=ABORT +test: export OCRD_MAX_MISSING_OUTPUTS=-1 test: $(TEST_SCRIPTS) - @echo $^ + @echo =====single-threaded test results===== + @cat test_serially.log + @echo =====4-page-parallel test results===== + @cat test_parallel.log + @$(RM) test_serially.log test_parallel.log + .PHONY: install install-devel uninstall test docker-build docker-push diff --git a/ocrd_cis/__init__.py b/ocrd_cis/__init__.py index 6f37f4f7..9d22fe3e 100644 --- a/ocrd_cis/__init__.py +++ b/ocrd_cis/__init__.py @@ -1,3 +1,2 @@ 
from .javaprocess import JavaAligner from .javaprocess import JavaPostCorrector -from .ocrd_tool import get_ocrd_tool diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index ffe53fd8..395f7b07 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -1,150 +1,123 @@ from __future__ import absolute_import +from __future__ import annotations + import click import json import os -import Levenshtein -from ocrd import Processor +from typing import Optional, List, Dict, Type + +from rapidfuzz.distance import Levenshtein + +from ocrd import Processor, OcrdPage, OcrdPageResult from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import MIMETYPE_PAGE -from ocrd_utils import getLogger from ocrd_utils import getLevelName -from ocrd_utils import make_file_id -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import to_xml -from ocrd_models.ocrd_page_generateds import TextEquivType +from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner -from ocrd_cis import get_ocrd_tool + @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): - return ocrd_cli_wrap_processor(Aligner, *args, **kwargs) - -class Aligner(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-align'] - kwargs['version'] = ocrd_tool['version'] - super(Aligner, self).__init__(*args, **kwargs) + return ocrd_cli_wrap_processor(CISAligner, *args, **kwargs) - if hasattr(self, 'workspace'): - self.log = getLogger('cis.Processor.Aligner') +class CISAligner(Processor): + @property + def executable(self): + return 'ocrd-cis-align' - def process(self): - ifgs = self.input_file_grp.split(",") # input file groups - if len(ifgs) < 2: - raise Exception("need at least two input file groups to align") - ifts = self.zip_input_files(ifgs) # input file tuples - for _id, ift in enumerate(ifts): - 
alignments = json.loads(self.run_java_aligner(ift)) - pcgts = self.align(alignments, ift) - # keep the right part after OCR-D-...-filename - # and prepend output_file_grp - input_file = ift[0].input_file - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts), - ) - self.log.info('created file %s', out) + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + assert len(input_pcgts) >= 2 + alignments = json.loads(self.run_java_aligner(input_pcgts)) + pcgts = self.align(alignments, input_pcgts) + return OcrdPageResult(pcgts) - def align(self, alignments, ift): + def align(self, alignments: List[Dict], pcgts: List[OcrdPage]) -> OcrdPage: """align the alignment objects with the according input file tuples""" - for t in ift: - self.log.debug("tuple %s", os.path.basename(t.input_file.url)) - pcgtst = self.open_input_file_tuples(ift) i = 0 - for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): + file_groups = self.input_file_grp.split(',') + for mi, mr in enumerate(pcgts[0].get_Page().get_AllRegions(classes=['Text'])): for mj, _ in enumerate(mr.get_TextLine()): - for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) - for xx in mr.get_TextLine()[mj].get_Word(): - for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) - lines = [] - for ii, t in enumerate(ift): + for ii, page in enumerate(pcgts): if i >= len(alignments): break - tr = pcgtst[ii].get_Page().get_TextRegion() + tr = page.get_Page().get_AllRegions(classes=['Text']) region = tr[mi].get_TextLine()[mj] - lines.append(Alignment(t, region, alignments[i])) + lines.append(Alignment(file_groups[ii], 
page, region, alignments[i])) self.align_lines(lines) i += 1 - return pcgtst[0] + return pcgts[0] - def align_lines(self, lines): + def align_lines(self, lines: List[Alignment]) -> None: """align the given line alignment with the lines""" if not lines: return - if len(lines[0].region.get_TextEquiv()) > 1: - del lines[0].region.get_TextEquiv()[1:] + if len(lines[0].region.TextEquiv) > 1: + del lines[0].region.TextEquiv[1:] for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is None: lines[0].region.TextEquiv = [] - self.log.debug('line alignment: %s [%s - %s]', - get_textequiv_unicode(line.region), - line.region.get_id(), - line.input_file.input_file_group) - ddt = line.input_file.input_file_group + "/" + line.region.get_id() - if i != 0: + self.logger.debug( + 'line alignment: %s [%s - %s]', + get_textequiv_unicode(line.region), + line.region.get_id(), + line.file_grp + ) + ddt = line.file_grp + "/" + line.region.get_id() + if i > 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), dataType="other", - dataTypeDetails="ocrd-cis-line-alignment:" + ddt) + dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: - self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) - lines[0].region.get_TextEquiv()[i].set_dataType("other") - lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( + self.logger.debug("len: %i, i: %i", len(lines[0].region.TextEquiv), i) + lines[0].region.TextEquiv[i].set_dataType("other") + lines[0].region.TextEquiv[i].set_dataTypeDetails( "ocrd-cis-line-alignment-master-ocr:" + ddt) - lines[0].region.get_TextEquiv()[i].set_index(i+1) + lines[0].region.TextEquiv[i].set_index(i+1) self.align_words(lines) - def align_words(self, lines): - # self.log.info(json.dumps(lines[0].alignment)) + def align_words(self, lines: List[Alignment]) -> None: + # self.logger.info(json.dumps(lines[0].alignment)) mregion = lines[0].region.get_Word() 
oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug("aligning word %s", word['master']) + self.logger.debug("aligning word %s", word['master']) master, rest = self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn("cannot find {}; giving up".format(word['master'])) + self.logger.warn("cannot find {}; giving up".format(word['master'])) # raise Exception("cannot find {}; giving up".format(word['master'])) return others = list() for i, other in enumerate(word['alignments']): match, rest = self.find_word(other, oregion[i]) if match is None: - self.log.warn("cannot find {}; giving up".format(other)) + self.logger.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest words = list() words.append( - Alignment(lines[0].input_file, master, lines[0].alignment)) + Alignment(lines[0].file_grp, lines[0].pcgts, master, lines[0].alignment)) for i, other in enumerate(others): words.append(Alignment( - lines[i+1].input_file, + lines[i+1].file_grp, + lines[i+1].pcgts, other, lines[i+1].alignment)) self.align_word_regions(words) - def align_word_regions(self, words): + def align_word_regions(self, words: List[Alignment]) -> None: def te0(x): - return x.get_TextEquiv()[0] + return x.TextEquiv[0] for i, word in enumerate(words): if not word.region: - ifg = word.input_file.input_file_group - self.log.debug("(empty) word alignment: [%s]", ifg) + ifg = word.file_grp + self.logger.debug("(empty) word alignment: [%s]", ifg) te = TextEquivType( dataType="other", dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) @@ -153,50 +126,42 @@ def te0(x): continue _str = " ".join([te0(x).Unicode for x in word.region]) _id = ",".join([x.get_id() for x in word.region]) - ifg = word.input_file.input_file_group - ddt = word.input_file.input_file_group + "/" + _id + ifg = word.file_grp + ddt = word.file_grp + "/" + _id # 
if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug("word alignment: %s [%s - %s]", _str, _id, ifg) + self.logger.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( - Unicode=_str, - conf=conf, - dataType="other", - dataTypeDetails="ocrd-cis-word-alignment:" + ddt) + Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") words[0].region[0].add_TextEquiv(te) else: words[0].region[0].get_TextEquiv()[i].set_dataType("other") - words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-word-alignment-master-ocr:" + ddt) + words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails(f"ocrd-cis-word-alignment-master-ocr:{ddt}") words[0].region[0].get_TextEquiv()[i].set_index(i+1) def find_word(self, tokens, regions, t="other"): - self.log.debug("tokens = %s [%s]", tokens, t) + tokens_str = f"tokens = {tokens} [{t}]" + self.logger.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) - # nothing could be found return tuple([None, regions]) @@ -212,7 +177,7 @@ def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = 
Levenshtein.distance(a, b) - self.log.debug("lev %s <=> %s: %d (%d)", a, b, d, d) + self.logger.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -227,14 +192,15 @@ def match_tokens_lambda(self, tokens, regions, i, f): Returns 0 if nothing could be matched. """ for j, token in enumerate(tokens): - if j + i >= len(regions): + sum_i_j = j + i + if sum_i_j >= len(regions): return 0 - if not regions[i+j].get_TextEquiv()[0].Unicode: - self.log.warn("cannot find %s", token) + unicode = regions[sum_i_j].TextEquiv[0].Unicode + if not unicode: + self.logger.warn(f"cannot find {token}") return 0 - self.log.debug('checking %s with %s', token, - regions[i+j].get_TextEquiv()[0].Unicode) - if f(token, regions[i+j].get_TextEquiv()[0].Unicode): + self.logger.debug(f'checking {token} with {unicode}') + if f(token, unicode): continue if j == 0: return 0 @@ -244,69 +210,29 @@ def match_tokens_lambda(self, tokens, regions, i, f): i += 1 return i + len(tokens) - def open_input_file_tuples(self, ift): - """ - opens all xml files of the given input file tuple - and returns them as tuples - """ - res = list() - for ifile in ift: - pcgts = ifile.open() - res.append(pcgts) - return tuple(res) - - def zip_input_files(self, ifgs): - """Zip files of the given input file groups""" - files = list() - for ifg in ifgs: - self.log.info("input file group: %s", ifg) - ifiles = sorted( - self.workspace.mets.find_files(fileGrp=ifg), - key=lambda ifile: ifile.url) - for i in ifiles: - self.log.debug("sorted file: %s %s", - os.path.basename(i.url), i.ID) - ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] - files.append(ifiles) - return zip(*files) - - def read_lines_from_input_file(self, ifile): - self.log.info("reading input file: %s", ifile) + def run_java_aligner(self, input_pcgts: List[OcrdPage]) -> str: lines = list() - pcgts = ifile.open() - for region in pcgts.get_Page().get_TextRegion(): - for line in 
region.get_TextLine(): - lines.append(get_textequiv_unicode(line)) - return lines - - def run_java_aligner(self, ifs): - lines = list() - for ifile in ifs: - lines.append(self.read_lines_from_input_file(ifile)) + for pcgts in input_pcgts: + lines.append([get_textequiv_unicode(line) + for line in pcgts.get_Page().get_AllTextLines()]) + # JavaAligner expects a strange input format lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug("input line: %s", i) - n = len(ifs) - self.log.debug("starting java client") - p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) + self.logger.debug("input line: %s", i) + n = len(input_pcgts) + self.logger.debug("starting java client") + p = JavaAligner(n, getLevelName(self.logger.getEffectiveLevel())) return p.run("\n".join(_input)) -class FileAlignment: - def __init__(self, workspace, ifile, ifg): - self.workspace = workspace - self.input_file = ifile - self.input_file_group = ifg - self.log = getLogger('cis.FileAlignment') - - def open(self): - self.log.info("opening: %s", os.path.basename(self.input_file.url)) - return page_from_file(self.workspace.download_file(self.input_file)) - - class Alignment: - def __init__(self, ifile, region, alignment): - self.input_file = ifile + file_grp: str + pcgts: OcrdPage + region: TextRegionType + alignment: dict + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: dict): + self.file_grp = file_grp + self.pcgts = pcgts self.region = region self.alignment = alignment diff --git a/ocrd_cis/data/__main__.py b/ocrd_cis/data/__main__.py index 3d8ef735..8fdcddd6 100644 --- a/ocrd_cis/data/__main__.py +++ b/ocrd_cis/data/__main__.py @@ -1,18 +1,18 @@ -import pkg_resources import sys +from ocrd_utils import resource_filename def main(): usage = 'usage: ' + sys.argv[0] + ' -jar|-3gs|-model|-config' if '-h' in sys.argv: print(usage) elif '-jar' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 
'data/ocrd-cis.jar')) + print(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) elif '-3gs' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/3gs.csv.gz')) + print(resource_filename('ocrd_cis', 'data/3gs.csv.gz')) elif '-model' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/model.zip')) + print(resource_filename('ocrd_cis', 'data/model.zip')) elif '-config' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/config.json')) + print(resource_filename('ocrd_cis', 'data/config.json')) else: raise ValueError(usage) diff --git a/ocrd_cis/div/eval.py b/ocrd_cis/div/eval.py index 6efe90c6..f47682ff 100644 --- a/ocrd_cis/div/eval.py +++ b/ocrd_cis/div/eval.py @@ -1,6 +1,6 @@ import os from PIL import Image -from Levenshtein import distance +from rapidfuzz.distance.Levenshtein import distance path = '/mnt/c/Users/chris/Documents/projects/OCR-D/daten/gt/lines/' diff --git a/ocrd_cis/div/stats.py b/ocrd_cis/div/stats.py index ea385d98..6f9c9816 100644 --- a/ocrd_cis/div/stats.py +++ b/ocrd_cis/div/stats.py @@ -4,7 +4,7 @@ from ocrd import Processor from ocrd_cis import get_ocrd_tool from ocrd_models.ocrd_page_generateds import parse -from Levenshtein import distance +from rapidfuzz.distance import Levenshtein class Stats(Processor): @@ -81,7 +81,7 @@ def process(self): # print(line.get_TextEquiv()[2].dataType) unicodeline = line.get_TextEquiv()[i].Unicode - d[i] += distance(gtline, unicodeline) + d[i] += Levenshtein.distance(gtline, unicodeline) # words = line.get_Word() # for word in words: diff --git a/ocrd_cis/javaprocess.py b/ocrd_cis/javaprocess.py index ce2f6bfd..72915d68 100644 --- a/ocrd_cis/javaprocess.py +++ b/ocrd_cis/javaprocess.py @@ -1,12 +1,11 @@ import subprocess import json -import pkg_resources -from ocrd_utils import getLogger +from ocrd_utils import getLogger, resource_filename from pathlib import Path MAIN = "de.lmu.cis.ocrd.cli.Main" -JAR = pkg_resources.resource_filename('ocrd_cis', 
'data/ocrd-cis.jar') +JAR = str(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) def JavaAligner(n, loglvl): """Create a java process that calls -c align -D '{"n":n}'""" diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index a93917da..c2e20268 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -12,17 +12,9 @@ "preprocessing/optimization/grayscale_normalization", "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-BIN", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with Ocropy v1", "parameters": { "method": { "type": "string", @@ -75,15 +67,9 @@ "steps": [ "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Deskew regions with ocropy (by annotating orientation angle and adding AlternativeImage)", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Deskew regions with Ocropy v1 (by annotating orientation angle and adding AlternativeImage)", "parameters": { "maxskew": { "type": "number", @@ -106,17 +92,9 @@ "steps": [ "preprocessing/optimization/despeckling" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-DESPECK", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Despeckle pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Despeckle pages / regions / lines with Ocropy v1", "parameters": { "noise_maxsize": { "type": "number", @@ -147,14 
+125,8 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "description": "Clip text regions / lines at intersections with neighbours", "parameters": { "level-of-operation": { @@ -185,12 +157,8 @@ "steps": [ "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "description": "Improve coordinates of text lines", "parameters": { "level-of-operation": { @@ -245,12 +213,8 @@ "preprocessing/optimization/dewarping" ], "description": "Dewarp line images with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "dpi": { "type": "number", @@ -286,15 +250,9 @@ "steps": [ "recognition/text-recognition" ], - "description": "Recognize text in (binarized+deskewed+dewarped) lines with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-SEG-WORD", - "OCR-D-SEG-GLYPH" - ], - "output_file_grp": [ - "OCR-D-OCR-OCRO" - ], + "description": "Recognize text in (binarized+deskewed+dewarped) lines with Ocropy v1", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "textequiv_level": { "type": "string", @@ -345,14 +303,9 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], - "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Segment pages into regions and lines, tables into cells and 
lines, or regions into lines with Ocropy v1", "parameters": { "dpi": { "type": "number", @@ -444,11 +397,9 @@ "steps": [ "recognition/text-recognition" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "description": "train model with ground truth from mets data", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "train Ocropy v1 text recognition model with PAGE ground truth from the input fileGrp extracted as file pairs into the output fileGrp", "parameters": { "textequiv_level": { "type": "string", @@ -470,7 +421,8 @@ }, "outputpath": { "type": "string", - "description": "(existing) path for the trained model" + "default": "output", + "description": "directory path for the trained model" } } }, @@ -482,15 +434,9 @@ "steps": [ "recognition/post-correction" ], - "input_file_grp": [ - "OCR-D-OCR-1", - "OCR-D-OCR-2", - "OCR-D-OCR-N" - ], - "output_file_grp": [ - "OCR-D-ALIGNED" - ], - "description": "Align multiple OCRs and/or GTs" + "input_file_grp_cardinality": [2, -1], + "output_file_grp_cardinality": 1, + "description": "Align multiple OCRs and/or GTs textually on line/word level" }, "ocrd-cis-postcorrect": { "executable": "ocrd-cis-postcorrect", @@ -501,12 +447,8 @@ "recognition/post-correction" ], "description": "Post correct OCR results", - "input_file_grp": [ - "OCR-D-LINE-ALIGNED" - ], - "output_file_grp": [ - "OCR-D-POST-CORRECTED" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "maxCandidates": { "description": "Maximum number of considered correction candidates per suspicious token", diff --git a/ocrd_cis/ocrd_tool.py b/ocrd_cis/ocrd_tool.py deleted file mode 100644 index 36cb9d7e..00000000 --- a/ocrd_cis/ocrd_tool.py +++ /dev/null @@ -1,6 +0,0 @@ -import json -from ocrd_utils import resource_string - - -def get_ocrd_tool(): - return json.loads(resource_string(__name__, 'ocrd-tool.json')) diff --git a/ocrd_cis/ocropy/binarize.py 
b/ocrd_cis/ocropy/binarize.py index 872185c3..9a55301d 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,38 +1,21 @@ from __future__ import absolute_import +from logging import Logger +from typing import Optional -import os.path import cv2 import numpy as np from PIL import Image -#import kraken.binarization +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) -from ocrd import Processor - -from .. import get_ocrd_tool from . import common -from .common import ( - pil2array, array2pil, - # binarize, - remove_noise) - -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) +from .common import array2pil, determine_zoom, pil2array, remove_noise -TOOL = 'ocrd-cis-ocropy-binarize' -def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - LOG = getLogger('processor.OcropyBinarize') - LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) +def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): + logger.debug(f'Binarizing {pil_image.width}x{pil_image.height} image with method={method}') if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -54,42 +37,33 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo if method == 'global': # global thresholding - _, th = cv2.threshold(img,threshold*255,255,cv2.THRESH_BINARY) + _, th = cv2.threshold(img, threshold * 255, 255, cv2.THRESH_BINARY) elif method == 'otsu': # Otsu's thresholding - _, th = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = 
cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) elif method == 'gauss-otsu': # Otsu's thresholding after Gaussian filtering blur = cv2.GaussianBlur(img, (5, 5), 0) - _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) else: raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 - class OcropyBinarize(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-binarize' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyBinarize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - def setup(self): - self.logger = getLogger('processor.OcropyBinarize') - if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': - self.logger.critical('requested method %s does not support grayscale normalized output', - self.parameter['method']) - raise Exception('only method=ocropy allows grayscale=true') + method = self.parameter['method'] + if self.parameter['grayscale'] and method != 'ocropy': + self.logger.critical(f'Requested method {method} does not support grayscale normalized output') + raise ValueError('only method=ocropy allows grayscale=true') - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested + Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. 
Next, for each file, crop each segment image according to the layout @@ -105,80 +79,61 @@ def process(self): Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return a PAGE-XML with new AlternativeImage(s) and the arguments + for ``workspace.save_image_file``. """ level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) + pcgts = input_pcgts[0] + assert pcgts + page = pcgts.get_Page() + assert page - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, feature_filter='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - if level == 'page': - self.process_page(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, feature_filter='binarized') - if level == 
'region': - self.process_region(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + result = OcrdPageResult(pcgts) + if level == 'page': + try: + result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) + except ValueError as e: + self.logger.error(e) + else: + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning(f"Page '{page_id}' contains no regions") + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_filter='binarized') + if level == 'region': + try: + result.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) + continue + except ValueError as e: + self.logger.error(e) + lines = region.get_TextLine() + 
if not lines: + self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, feature_filter='binarized') + try: + result.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) + except ValueError as e: + self.logger.error(e) + return result - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: - self.logger.warning("Skipping page '%s' with zero size", page_id) - return - self.logger.info("About to binarize page '%s'", page_id) + raise ValueError(f"Skipping page '{page_id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}'") + features = page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: # orientation has already been annotated (by previous deskewing), @@ -186,65 +141,64 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): maxskew = 0 else: maxskew = self.parameter['maxskew'] - bin_image, angle = binarize(page_image, - method=self.parameter['method'], - maxskew=maxskew, - threshold=self.parameter['threshold'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + self.logger, + page_image, + method=self.parameter['method'], + maxskew=maxskew, + threshold=self.parameter['threshold'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' page_xywh['angle'] = angle if self.parameter['noise_maxsize']: - bin_image = remove_noise( - bin_image, maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage # to do consistent coordinate transforms, and non-consumers # 
to redo the rotation themselves): orientation = -page_xywh['angle'] - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) - # update METS (add the image file): if self.parameter['grayscale']: - file_id += '.IMG-NRM' + suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + suffix = '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + alt_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: - self.logger.warning("Skipping region '%s' with zero size", region.id) - return - self.logger.info("About to binarize page '%s' region '%s'", page_id, region.id) + raise ValueError(f"Skipping region '{region.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") features = region_xywh['features'] if 'angle' in region_xywh and region_xywh['angle']: # orientation has already been annotated (by previous deskewing), # so skip deskewing here: - bin_image, _ = binarize(region_image, - method=self.parameter['method'], - maxskew=0, - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, _ = binarize( + self.logger, + region_image, + method=self.parameter['method'], + maxskew=0, + nrm=self.parameter['grayscale'], + zoom=zoom) else: - bin_image, angle = binarize(region_image, - method=self.parameter['method'], - 
maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + self.logger, + region_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' region_xywh['angle'] = angle - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -253,33 +207,30 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - # update METS (add the image file): + suffix = f'{region.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + alt_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, file_id): + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: - self.logger.warning("Skipping line '%s' with zero size", line.id) - return - self.logger.info("About to binarize page '%s' region '%s' line '%s'", - page_id, region_id, line.id) + raise ValueError(f"Skipping line '{line.id}' with 
zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] - bin_image, angle = binarize(line_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + self.logger, + line_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -288,23 +239,19 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! - self.logger.warning("cannot add orientation %.2f to page '%s' region '%s' line '%s'", - -angle, page_id, region_id, line.id) - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + self.logger.warning( + f"Cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", -angle) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - # update METS (add the image file): + suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + alt_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) diff --git 
a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a305f09e..18a0c115 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,50 +1,36 @@ from __future__ import absolute_import +from logging import Logger +from typing import Optional -import os.path import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) -from ocrd import Processor +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - coordinates_of_segment, - polygon_from_points, bbox_from_polygon, + coordinates_of_segment, + crop_image, image_from_polygon, + polygon_from_points, polygon_mask, - crop_image, - MIMETYPE_PAGE ) -from .. import get_ocrd_tool +from .common import array2pil, determine_zoom, pil2array from .ocrolib import midrange, morph -from .common import ( - # binarize, - pil2array, array2pil -) -TOOL = 'ocrd-cis-ocropy-clip' class OcropyClip(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-clip' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyClip, self).__init__(*args, **kwargs) + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: + """Clip text regions / lines of a page at intersections with neighbours. - def process(self): - """Clip text regions / lines of the workspace at intersections with neighbours. 
- - Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``level-of-operation``. @@ -64,7 +50,7 @@ def process(self): Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ # This makes best sense for overlapping segmentation, like current GT # or Tesseract layout analysis. Most notably, it can suppress graphics @@ -74,39 +60,26 @@ def process(self): # connected component analysis after implicit binarization could be # suboptimal, and the explicit binarization after clipping could be, # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their + # deskewing, because that would make segments incommensurable with their # neighbours. - LOG = getLogger('processor.OcropyClip') level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - # 
FIXME: what about text regions inside table regions? - regions = list(page.get_TextRegion()) - num_texts = len(regions) - regions += ( + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + # The zoom is not used anywhere + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + ret = OcrdPageResult(pcgts) + + # FIXME: what about text regions inside table regions? + regions = list(page.get_TextRegion()) + num_texts = len(regions) + regions += ( page.get_AdvertRegion() + page.get_ChartRegion() + page.get_ChemRegion() + @@ -119,147 +92,121 @@ def process(self): page.get_SeparatorRegion() + page.get_TableRegion() + page.get_UnknownRegion()) - if not num_texts: - LOG.warning('Page "%s" contains no text regions', page_id) - background = ImageStat.Stat(page_image) - # workaround for Pillow#4925 - if len(background.bands) > 1: - background = tuple(background.median) - else: - background = background.median[0] + if not num_texts: + self.logger.warning(f'Page "{page_id}" contains no text regions') + background = ImageStat.Stat(page_image) + # workaround for Pillow#4925 + if len(background.bands) > 1: + background = tuple(background.median) + else: + background = background.median[0] + if level == 'region': + background_image = Image.new(page_image.mode, page_image.size, background) + page_array = pil2array(page_image) + page_bin = np.array(page_array <= midrange(page_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(region, page_image, page_xywh) for region in regions] + for i, polygon in enumerate(polygons[num_texts:], 
num_texts): + # for non-text regions, extend mask by 3 pixels in each direction + # to ensure they do not leak components accidentally + # (accounts for bad cropping of such regions in GT): + polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open + polygons[i] = polygon + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] + for i, region in enumerate(regions): + if i >= num_texts: + break # keep non-text regions unchanged if level == 'region': - background_image = Image.new(page_image.mode, page_image.size, background) - page_array = pil2array(page_image) - page_bin = np.array(page_array <= midrange(page_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) - for region in regions] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) - for region in regions] - for i, polygon in enumerate(polygons[num_texts:], num_texts): - # for non-text regions, extend mask by 3 pixels in each direction - # to ensure they do not leak components accidentally - # (accounts for bad cropping of such regions in GT): - polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open - polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) - for polygon in polygons] - for i, region in enumerate(regions): - if i >= num_texts: - break # keep non-text regions unchanged - if level == 'region': - if region.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- LOG.warning('Page "%s" region "%s" already contains image data: skipping', - page_id, region.id) - continue - shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], - regions[:i] + regions[i+1:], - masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] - if neighbours: - self.process_segment(region, masks[i], polygons[i], - neighbours, background_image, - page_image, page_coords, page_bin, - input_file.pageId, file_id + '_' + region.id) + if region.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). + self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue - # level == 'line': - lines = region.get_TextLine() - if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + shape = prep(shapes[i]) + neighbours = [ + (regionj, maskj) for shapej, regionj, maskj in + zip(shapes[:i] + shapes[i + 1:], regions[:i] + regions[i + 1:], masks[:i] + masks[i + 1:]) + if shape.intersects(shapej)] + if neighbours: + ret.images.append(self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_xywh, page_bin, page_id)) + continue + # level == 'line': + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + continue + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_selector='binarized') + background_image = Image.new(region_image.mode, region_image.size, background) + region_array = pil2array(region_image) + region_bin = np.array(region_array <= midrange(region_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] + # in relative coordinates for mask/cropping + polygons = 
[coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] + for j, line in enumerate(lines): + if line.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). + self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - background_image = Image.new(region_image.mode, region_image.size, background) - region_array = pil2array(region_image) - region_bin = np.array(region_array <= midrange(region_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) - for line in lines] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) - for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) - for polygon in polygons] - for j, line in enumerate(lines): - if line.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', - page_id, region.id, line.id) - continue - shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], - lines[:j] + lines[j+1:], - masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] - if neighbours: - self.process_segment(line, masks[j], polygons[j], - neighbours, background_image, - region_image, region_coords, region_bin, - input_file.pageId, file_id + '_' + region.id + '_' + line.id) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, - background_image, parent_image, parent_coords, parent_bin, - page_id, file_id): - LOG = getLogger('processor.OcropyClip') + shape = prep(shapes[j]) + neighbours = [ + (linej, maskj) for shapej, linej, maskj in + zip(shapes[:j] + shapes[j + 1:], lines[:j] + lines[j + 1:], masks[:j] + masks[j + 1:]) + if shape.intersects(shapej)] + if neighbours: + ret.images.append(self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, page_id)) + return ret + + def process_segment( + self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, + parent_bin, page_id + ) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( [feature for feature in parent_coords['features'].split(',') - if feature in 
['binarized', 'grayscale_normalized', - 'despeckled', 'dewarped']]) + ',clipped' + if feature in ['binarized', 'grayscale_normalized', 'despeckled', 'dewarped']]) + ',clipped' # mask segment within parent image: segment_image = image_from_polygon(parent_image, segment_polygon) segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - LOG.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', - neighbour.id, segment.id, page_id) + self.logger.info( + f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: - intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour - intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively + intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour + intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively num_intruders = np.count_nonzero(intruders) num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - LOG.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', - segment.id, neighbour.id, num_intruders, num_foreground, page_id) + self.logger.debug( + f'segment "{segment.id}" vs neighbour "{neighbour.id}": suppressing {num_intruders} of ' + f'{num_foreground} pixels on page "{page_id}"') # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) segment_mask -= intruders # suppress in derived image result to be annotated clip_mask = array2pil(intruders) - segment_image.paste(background_image, mask=clip_mask) # suppress in raw image + segment_image.paste(background_image, mask=clip_mask) # suppress in raw image if segment_image.mode in ['RGB', 'L', 'RGBA', 'LA']: 
# for consumers that do not have to rely on our # guessed background color, but can cope with transparency: segment_image.putalpha(ImageOps.invert(clip_mask)) # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): - segment_image = crop_image(segment_image,box=segment_bbox) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + suffix = f'{segment.id}.IMG-CLIP' + alternative_image = AlternativeImageType(comments=features) + segment.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(segment_image, suffix, alternative_image) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3cb9e4c4..bae4dac0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from typing import Optional import warnings import logging @@ -10,7 +11,7 @@ from skimage.morphology import medial_axis import networkx as nx from PIL import Image - +from ocrd_models import OcrdExif from . 
import ocrolib from .ocrolib import morph, psegutils, sl # for decorators (type-checks etc): @@ -643,7 +644,7 @@ def compute_seplines(binary, scale, maxseps=0): sepdists.append(np.median(subdistances)) #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) - sepslices = np.array(sepslices) + sepslices = np.array(sepslices, dtype=object) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) # now dilate+erode to link neighbouring candidates, @@ -2102,3 +2103,16 @@ def find_topological(): # rlabels[region_hull] = region # DSAVE('rlabels_closed', rlabels) return rlabels + +def determine_zoom(logger: logging.Logger, page_id: Optional[str], dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + logger.info(f"Page '{page_id}' uses {dpi} DPI.") + zoom = 300.0/dpi + else: + zoom = 1 + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cbbdf8cf..eaed74df 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,38 +1,22 @@ from __future__ import absolute_import +from typing import Optional +from logging import Logger -import os.path +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) -from ocrd import Processor - -from .. 
import get_ocrd_tool -from .common import ( - # binarize, - remove_noise) - -TOOL = 'ocrd-cis-ocropy-denoise' +from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-denoise' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDenoise, self).__init__(*args, **kwargs) - - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested ``level-of-operation``. @@ -50,86 +34,55 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropyDenoise') level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized' if level == 'page' else '') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - if level == 'page': - self.process_segment(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, - feature_selector='binarized' if level == 'region' else '') - if level == 'region': - self.process_segment(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') - self.process_segment(line, line_image, line_xywh, zoom, - 
input_file.pageId, - file_id + '_' + region.id + '_' + line.id) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): - LOG = getLogger('processor.OcropyDenoise') + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_selector='binarized' if level == 'page' else '') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + if level == 'page': + image = self.process_segment(page, page_image, page_xywh, zoom, page_id) + if image: + result.images.append(image) + else: + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning(f'Page "{page_id}" contains no text regions') + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, + feature_selector='binarized' if level == 'region' else '') + if level == 'region': + file_id = f"{page_id}_{region.id}" + image = self.process_segment(region, region_image, region_xywh, zoom, file_id) + if image: + result.images.append(image) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, feature_selector='binarized') + file_id = f"{page_id}_{region.id}_{line.id}" + image = 
self.process_segment(line, line_image, line_xywh, zoom, file_id) + if image: + result.images.append(image) + return result + + def process_segment(self, segment, segment_image, segment_xywh, zoom, file_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: - LOG.warning("Skipping '%s' with zero size", file_id) - return - LOG.info("About to despeckle '%s'", file_id) - bin_image = remove_noise(segment_image, - maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt - # update METS (add the image file): - file_path = self.workspace.save_image_file( - bin_image, file_id + '.IMG-DESPECK', self.output_file_grp, - page_id=page_id) + self.logger.warning(f"Skipping '{segment.id}' with zero size") + return None + self.logger.info(f"About to despeckle '{segment.id}'") + bin_image = remove_noise( + segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_xywh['features'] + ',despeckled')) + alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + suffix = f"{file_id}.IMG-DESPECK" + segment.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 4ed04218..b02c69d5 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,29 +1,13 @@ from __future__ import absolute_import +from typing import Optional +from logging import Logger -import os.path +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - PageType, - to_xml, 
AlternativeImageType -) -from ocrd import Processor - -from .. import get_ocrd_tool from . import common -from .common import ( - pil2array -) - -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -TOOL = 'ocrd-cis-ocropy-deskew' +from .common import pil2array def deskew(pil_image, maxskew=2): array = pil2array(pil_image) @@ -31,17 +15,14 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-deskew' - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][TOOL] - kwargs['version'] = ocrd_tool['version'] - super(OcropyDeskew, self).__init__(*args, **kwargs) - - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextRegion level. Next, for each file, crop each region image according to the layout @@ -56,95 +37,71 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropyDeskew') level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed' if level == 'page' else '') + if level == 'page': + image = self._process_segment(page, page_image, page_coords, "page '%s'" % page_id, page_id) + if image: + result.images.append(image) + return result + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + # process region: + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, # image must not have been rotated already, # (we will overwrite @orientation anyway,) # abort if no such image can be produced: - feature_filter='deskewed' if level == 'page' else '') - if level == 'page': - self._process_segment(page, page_image, page_coords, - "page '%s'" % page_id, input_file.pageId, - file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - 
regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) - for region in regions: - # process region: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, - # image must not have been rotated already, - # (we will overwrite @orientation anyway,) - # abort if no such image can be produced: - feature_filter='deskewed') - self._process_segment(region, region_image, region_coords, - "region '%s'" % region.id, input_file.pageId, - file_id + '_' + region.id) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): - LOG = getLogger('processor.OcropyDeskew') + feature_filter='deskewed') + image = self._process_segment(region, region_image, region_coords, f"region '{region.id}'", page_id) + if image: + result.images.append(image) + return result + + def _process_segment( + self, segment, segment_image, segment_coords, segment_id, page_id + ) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: - LOG.warning("Skipping %s with zero size", segment_id) - return - angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - LOG.info("About to deskew %s", segment_id) - angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied + self.logger.warning("Skipping %s with zero size", segment_id) + return None + angle0 = segment_coords['angle'] # deskewing (w.r.t. 
top image) already applied to segment_image + self.logger.info(f"About to deskew {segment_id}") + angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] - segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - LOG.info("Found angle for %s: %.1f", segment_id, angle) + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + segment.set_orientation(orientation) # also removes all deskewed AlternativeImages + self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( - segment, page_id, - fill='background', transparency=True) + segment, page_id, fill='background', transparency=True) + suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( - segment, segment_image, segment_coords, - fill='background', transparency=True) + segment, segment_image, segment_coords, fill='background', transparency=True) + suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, # but assures consuming processors that the # workflow had deskewing segment_coords['features'] += ',deskewed' - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-DESKEW', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_coords['features'])) + alternative = AlternativeImageType(comments=segment_coords['features']) + segment.add_AlternativeImage(alternative) + return 
OcrdPageResultImage(segment_image, suffix, alternative) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 7d3251bf..a0d0ea5c 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,30 +1,14 @@ from __future__ import absolute_import - -import os.path +from logging import Logger +from typing import Optional import numpy as np -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) from ocrd import Processor -from ocrd_utils import MIMETYPE_PAGE +from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from .. import get_ocrd_tool from .ocrolib import lineest -from .common import ( - pil2array, array2pil, - check_line, -) - -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - -TOOL = 'ocrd-cis-ocropy-dewarp' +from .common import array2pil, check_line, determine_zoom, pil2array class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -37,27 +21,27 @@ def dewarp(image, lnorm, check=True, max_neighbour=0.02, zoom=1.0): if not image.width or not image.height: raise InvalidLine('image size is zero') line = pil2array(image) - + if np.prod(line.shape) == 0: raise InvalidLine('image dimensions are zero') if np.amax(line) == np.amin(line): raise InvalidLine('image is blank') - - temp = np.amax(line)-line # inverse, zero-closed + + temp = np.amax(line) - line # inverse, zero-closed if check: report = check_line(temp, zoom=zoom) if report: raise InadequateLine(report) - - temp = temp * 1.0 / np.amax(temp) # normalized + + temp = temp * 1.0 / np.amax(temp) # normalized if check: report = lnorm.check(temp, max_ignore=max_neighbour) if report: raise InvalidLine(report) - lnorm.measure(temp) # find centerline + lnorm.measure(temp) # find centerline line = 
lnorm.dewarp(line, cval=np.amax(line)) - + return array2pil(line) # pad with white above and below (as a fallback for dewarp) @@ -69,32 +53,14 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-dewarp' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDewarp, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - - def setup(self): - # defaults from ocrolib.lineest: - self.lnorm = lineest.CenterNormalizer( - params=(self.parameter['range'], - self.parameter['smoothness'], - # let's not expose this for now - # (otherwise we must explain mutual - # dependency between smoothness - # and extra params) - 0.3)) - self.logger = getLogger('processor.OcropyDewarp') - - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextLine level. Next, get each line image according to the layout annotation (from @@ -110,81 +76,49 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - - lines = region.get_TextLine() - if not lines: - self.logger.warning('Region %s contains no text lines', region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) - try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) - except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) - continue - except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) - # as a fallback, simply pad the image vertically - # (just as dewarping would do on average, so at least - # this line has similar 
margins as the others): - dew_image = padvert(line_image, self.parameter['range']) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - dew_image, - file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP', - self.output_file_grp, - page_id=input_file.pageId) - # update PAGE (reference the image file): - alternative_image = line.get_AlternativeImage() - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=line_xywh['features'] + ',dewarped')) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + # defaults from ocrolib.lineest: + lnorm = lineest.CenterNormalizer( + params=(self.parameter['range'], + self.parameter['smoothness'], + # let's not expose this for now + # (otherwise we must explain mutual + # dependency between smoothness + # and extra params) + 0.3)) + + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning(f'Page "{page_id}" contains no text regions') + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Region {region.id} contains no text lines') + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh) + 
self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") + try: + dew_image = dewarp( + line_image, lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) + except (InvalidLine, AssertionError) as err: + self.logger.error(f'Cannot dewarp line "{line.id}": {err}') + continue + except InadequateLine as err: + self.logger.warning(f'cannot dewarp line "{line.id}": {err}') + # as a fallback, simply pad the image vertically + # (just as dewarping would do on average, so at least + # this line has similar margins as the others): + dew_image = padvert(line_image, self.parameter['range']) + # update PAGE (reference the image file): + alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') + line.add_AlternativeImage(alt_image) + suffix = f"{region.id}_{line.id}.IMG-DEWARP" + result.images.append(OcrdPageResultImage(dew_image, suffix, alt_image)) + return result diff --git a/ocrd_cis/ocropy/ocrolib/lineest.py b/ocrd_cis/ocropy/ocrolib/lineest.py index 42ef2237..392c7e4a 100644 --- a/ocrd_cis/ocropy/ocrolib/lineest.py +++ b/ocrd_cis/ocropy/ocrolib/lineest.py @@ -75,7 +75,7 @@ def measure(self,line): plt.plot(self.center) plt.ginput(1,1000) def dewarp(self,img,cval=0,dtype=np.dtype('f')): - assert img.shape==self.shape + assert img.shape==self.shape, f"input shape {img.shape} deviates from measured shape {self.shape}" h,w = img.shape # The actual image img is embedded into a larger image by # adding vertical space on top and at the bottom (padding) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 7d6ffc85..4b626e83 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -343,7 +343,7 @@ def select_regions(binary,f,min=0,nbest=100000): return keep[labels] @checks(SEGMENTATION) -def all_neighbors(image, dist=1, bg=NaN): +def all_neighbors(image, dist=1, bg=float('nan')): """Given an image with labels, find all pairs of labels that 
are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 @@ -429,7 +429,7 @@ def reading_order(seg,rl=False,bt=False): segmap[1:] = 1 return segmap def pos(f,l): - return array([f(x) if x else nan for x in l]) + return array([f(x) if x else float('nan') for x in l]) ys = pos(sl.ycenter,objects) yorder = argsort(ys)[::-1 if bt else 1] groups = [[yorder[0]]] diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 74d858ab..97bec8a7 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,36 +1,22 @@ from __future__ import absolute_import -import sys -import os.path +from logging import Logger +from sys import exit +from typing import Any, Optional +from os import access, R_OK +from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image from rapidfuzz.distance import Levenshtein -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - coordinates_for_segment, - polygon_from_bbox, - points_from_polygon, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, TextEquivType, - CoordsType, GlyphType, WordType -) -from ocrd import Processor - -from .. 
import get_ocrd_tool +from ocrd_utils import coordinates_for_segment, points_from_polygon, polygon_from_bbox +from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType +from ocrd import Processor, OcrdPageResult + +from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange -from .common import ( - pil2array, - check_line -) -TOOL = 'ocrd-cis-ocropy-recognize' def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height @@ -59,8 +45,8 @@ def recognize(image, pad, network, check=True): pred = network.predictString(line) # getting confidence - result = lstm.translate_back(network.outputs, pos=1) - scale = len(raw_line.T)*1.0/(len(network.outputs)-2*pad) + result = lstm.translate_back(network.outputs, pos=1) # raw positions + scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] rlist = [] @@ -70,7 +56,7 @@ def recognize(image, pad, network, check=True): if c != 0: confid = network.outputs[r, c] c = network.l2s([c]) - r = (r-pad)*scale + r = (r - pad) * scale confidlist.append(confid) clist.append(c) @@ -80,20 +66,17 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): + network: Any + pad: int + # lstm is not thread-safe (.outputs, .last_n as side effects etc) + max_workers = 1 + + @property + def executable(self): + return 'ocrd-cis-ocropy-recognize' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - self.pad = 16 # ocropus-rpred default - self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyRecognize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - def setup(self): - self.logger = getLogger('processor.OcropyRecognize') + self.pad = 16 # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): @@ 
-104,35 +87,36 @@ def setup(self): def get_model(self): """Search for the model file. First checks if parameter['model'] can - be resolved with OcrdResourceManager to a valid readeable file and + be resolved with OcrdResourceManager to a valid readable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: os.path.isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and access(p, R_OK) + p_model = self.parameter['model'] try: - model = self.resolve_resource(self.parameter['model']) + model = self.resolve_resource(p_model) if canread(model): return model except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(ocropydir, 'models', self.parameter['model']) - self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) + ocropydir = dirname(abspath(__file__)) + path = join(ocropydir, 'models', p_model) + self.logger.info(f"Failed to resolve model with OCR-D/core mechanism, trying {path}") if canread(path): return path - self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", - self.parameter['model'], self.parameter['model']) - sys.exit(1) + self.logger.error( + f"Could not find model {p_model}. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") + exit(1) - def process(self): - """Recognize lines / words / glyphs of the workspace. + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: + """Recognize lines / words / glyphs of a page. - Open and deserialise each PAGE input file and its respective image, + Open and deserialize the PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``textequiv_level``. If any layout annotation below the line level already exists, then remove it (regardless of ``textequiv_level``). 
- Set up Ocropy to recognise each text line (via coordinates into + Set up Ocropy to recognize each text line (via coordinates into the higher-level image, or from the alternative image; the image - must have been binarised/grayscale-normalised, deskewed and dewarped + must have been binarized/grayscale-normalised, deskewed and dewarped already). Rescale and pad the image, then recognize. Create new elements below the line level, if necessary. @@ -145,105 +129,80 @@ def process(self): Levenshtein distance. Aggregate these scores for each file and print the line-wise and the total character error rates (CER). - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - maxlevel = self.parameter['textequiv_level'] - - # self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id) - - self.logger.info("Recognizing text in page '%s'", page_id) - # region, line, word, or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - self.process_regions(regions, maxlevel, page_image, page_coords) - - # update METS (add the PAGE file): - file_id = make_file_id(input_file, self.output_file_grp) - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - 
local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_regions(self, regions, maxlevel, page_image, page_coords): + max_level = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{max_level}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id) + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + self.process_regions(regions, max_level, page_image, page_xywh) + return OcrdPageResult(pcgts) + + def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 lengs = 0 for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords) - - self.logger.info("Recognizing text in region '%s'", region.id) + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + self.logger.info(f"Recognizing text in region '{region.id}'") textlines = region.get_TextLine() if not textlines: - self.logger.warning("Region '%s' contains no text lines", region.id) + self.logger.warning(f"Region '{region.id}' contains no text lines") else: - edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_coords) + edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_xywh) edits += edits_ lengs += lengs_ # update region text by concatenation for consistency - region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode - if line.get_TextEquiv() - else u'' for line in textlines) + region_unicode = u'\n'.join( + line.get_TextEquiv()[0].Unicode if line.get_TextEquiv() else u'' for 
line in textlines) region.set_TextEquiv([TextEquivType(Unicode=region_unicode)]) if lengs > 0: self.logger.info('CER: %.1f%%', 100.0 * edits / lengs) - def process_lines(self, textlines, maxlevel, region_image, region_coords): + def process_lines(self, textlines, maxlevel, region_image, region_xywh): edits = 0 lengs = 0 for line in textlines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords) - - self.logger.info("Recognizing text in line '%s'", line.id) + line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) + self.logger.info(f"Recognizing text in line '{line.id}'") if line.get_TextEquiv(): linegt = line.TextEquiv[0].Unicode else: linegt = '' - self.logger.debug("GT '%s': '%s'", line.id, linegt) + self.logger.debug(f"GT '{line.id}': '{linegt}'") # remove existing annotation below line level: line.set_TextEquiv([]) line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug("ERROR: bounding box is too narrow at line %s", line.id) + self.logger.debug(f"Error: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = resize_keep_ratio(line_image) # process ocropy: try: - linepred, clist, rlist, confidlist = recognize( - final_img, self.pad, self.network, check=True) + linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug('error processing line "%s": %s', line.id, err) + self.logger.debug(f'Error processing line "{line.id}": {str(err) or err.__class__.__name__}') continue - self.logger.debug("OCR '%s': '%s'", line.id, linepred) + self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) lengs += len(linegt) words = [x.strip() for x in linepred.split(' ') if x.strip()] - word_r_list = [[0]] # r-positions of every glyph in every word - word_conf_list = [[]] # confidences of every glyph in every word + 
word_r_list = [[0]] # r-positions of every glyph in every word + word_conf_list = [[]] # confidences of every glyph in every word if words != []: w_no = 0 found_char = False @@ -252,12 +211,10 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): found_char = True word_conf_list[w_no].append(confidlist[i]) word_r_list[w_no].append(rlist[i]) - if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': + elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) w_no += 1 @@ -266,44 +223,38 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): word_r_list = [[0, line_image.width]] # conf for each word - wordsconf = [(min(x)+max(x))/2 for x in word_conf_list] + wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list] # conf for the line - line_conf = (min(wordsconf) + max(wordsconf))/2 + line_conf = (min(wordsconf) + max(wordsconf)) / 2 # line text - line.add_TextEquiv(TextEquivType( - Unicode=linepred, conf=line_conf)) + line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) if maxlevel in ['word', 'glyph']: for word_no, word_str in enumerate(words): word_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][0] / scale, - 0, - word_r_list[word_no][-1] / scale, - 0 + line_image.height)), + word_r_list[word_no][0] / scale,0, + word_r_list[word_no][-1] / scale, 0 + line_image.height)), line_image, line_coords)) word_id = '%s_word%04d' % (line.id, word_no) word = WordType(id=word_id, Coords=CoordsType(word_points)) line.add_Word(word) - word.add_TextEquiv(TextEquivType( - Unicode=word_str, conf=wordsconf[word_no])) + word.add_TextEquiv(TextEquivType(Unicode=word_str, conf=wordsconf[word_no])) if maxlevel == 'glyph': for glyph_no, glyph_str in enumerate(word_str): glyph_points = points_from_polygon( coordinates_for_segment( 
np.array(polygon_from_bbox( - word_r_list[word_no][glyph_no] / scale, - 0, - word_r_list[word_no][glyph_no+1] / scale, - 0 + line_image.height)), + word_r_list[word_no][glyph_no] / scale, 0, + word_r_list[word_no][glyph_no + 1] / scale, 0 + line_image.height)), line_image, line_coords)) glyph_id = '%s_glyph%04d' % (word.id, glyph_no) glyph = GlyphType(id=glyph_id, Coords=CoordsType(glyph_points)) word.add_Glyph(glyph) - glyph.add_TextEquiv(TextEquivType( - Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) + glyph.add_TextEquiv( + TextEquivType(Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) return edits, lengs diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index a337b5e0..0fb133c0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,35 +1,29 @@ from __future__ import absolute_import -import os.path +from typing import Optional +from logging import Logger + import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, PageType, BaselineType -) -from ocrd import Processor from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, polygon_from_points, transform_coordinates, - MIMETYPE_PAGE ) +from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage +from ocrd import Processor, OcrdPageResult -from .. 
import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( pil2array, odd, DSAVE, + determine_zoom, # binarize, check_page, check_region, @@ -46,20 +40,15 @@ diff_polygons ) -TOOL = 'ocrd-cis-ocropy-resegment' - class OcropyResegment(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-resegment' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super().__init__(*args, **kwargs) - - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the line level. Next, get the page image according to the layout annotation (from @@ -98,7 +87,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyResegment') # This makes best sense for bad/coarse line segmentation, like current GT # or as postprocessing for bbox-only steps like Tesseract. # Most notably, it can convert rectangles to polygons (polygonalization), @@ -109,82 +97,51 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). 
level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - - for n, input_file in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) + pcgts = input_pcgts[0] + page = pcgts.get_Page() - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID - page = pcgts.get_Page() + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no text regions') + elif level == 'page': + lines = [line for region in regions + for line in region.get_TextLine()] + if lines: + self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - zoom = 1 - - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - 
page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - regions = page.get_AllRegions(classes=['Text']) - if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) - elif level == 'page': - lines = [line for region in regions - for line in region.get_TextLine()] + self.logger.warning(f'Page "{page_id}" contains no text regions with lines', ) + else: + for region in regions: + lines = region.get_TextLine() if lines: - self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" contains no text regions with lines', page_id) - else: - for region in regions: - lines = region.get_TextLine() - if lines: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) - else: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + return 
OcrdPageResult(pcgts) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): - LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] method = self.parameter['method'] - maxdist = self.parameter['spread']/zoom*300/72 # in pt + maxdist = self.parameter['spread'] / zoom * 300 / 72 # in pt # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw @@ -199,8 +156,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - LOG.warning('Invalid %s "%s": %s', tag, - page_id if fullpage else parent.id, report) + self.logger.warning(f'Invalid {tag} "{page_id if fullpage else parent.id}": {report}') return # get existing line labels: line_labels = np.zeros_like(parent_bin, bool) @@ -209,7 +165,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, line in enumerate(lines): if self.parameter['baseline_only'] and line.Baseline: line_base = baseline_of_segment(line, parent_coords) - line_poly = polygon_from_baseline(line_base, 30/zoom) + line_poly = polygon_from_baseline(line_base, 30 / zoom) else: line_poly = coordinates_of_segment(line, parent_image, parent_coords) line_poly = make_valid(Polygon(line_poly)) @@ -221,39 +177,32 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does not need # to concern herself with this. 
- line_y, line_x = draw.polygon(polygon[:, 1], - polygon[:, 0], - parent_bin.shape) + line_y, line_x = draw.polygon(polygon[:, 1], polygon[:, 0], parent_bin.shape) line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - LOG.debug('unmasking area of text region "%s" for "%s"', - region.id, page_id if fullpage else parent.id) + self.logger.debug(f'Unmasking area of text region "{region.id}" for "{page_id if fullpage else parent.id}"') region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] - ignore_bin[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - parent_bin.shape)] = False + ignore_bin[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): - LOG.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], - segment.id, page_id if fullpage else parent.id) + self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' + f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape)] = True + ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - LOG.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) _, counts = np.unique(components, return_counts=True) if counts.shape[0] > 1: counts = np.sqrt(3 * counts) - scale = 
int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) - components *= (counts > 15/zoom)[components] - LOG.debug("estimated scale: %d", scale) + scale = int(np.median(counts[(5 / zoom < counts) & (counts < 100 / zoom)])) + components *= (counts > 15 / zoom)[components] + self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 if method == 'ccomps': @@ -271,7 +220,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - LOG.warning("Skipping '%s' without baseline", line.id) + self.logger.warning(f"Skipping '{line.id}' without baseline") new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -281,27 +230,27 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_polygon[:, 0], parent_bin.shape) new_labels[line_y, line_x] = i + 1 - spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, + maxdist=maxdist or scale / 2, loc=parent.id, threshold=threshold) return try: + # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( - parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, + parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale / 2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - LOG.error('Cannot line-segment %s "%s": %s', - tag, page_id if fullpage else parent.id, err) + self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') return - LOG.info("Found %d new line labels for %d existing lines on %s '%s'", - new_line_labels.max(), len(lines), 
tag, parent.id) + self.logger.info( + f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons( - new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom) + self.logger, new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', + min_area=640 / zoom / zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) - for _, poly, base in new_line_polygons])) or ([], []) + new_line_polygons, new_baselines = list(zip( + *[(Polygon(poly), LineString(base)) for _, poly, base in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ -319,12 +268,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l inter = make_intersection(line_poly.context, new_line_poly) if not inter: continue - new_line_mask = (new_line_labels == i+1) & parent_bin + new_line_mask = (new_line_labels == i + 1) & parent_bin line_mask = line_labels[j] & parent_bin inter_mask = new_line_mask & line_mask if (not np.count_nonzero(inter_mask) or - not np.count_nonzero(new_line_mask) or - not np.count_nonzero(line_mask)): + not np.count_nonzero(new_line_mask) or + not np.count_nonzero(line_mask)): continue intersections[(i, j)] = inter fits_bg[i, j] = inter.area / new_line_poly.area @@ -380,47 +329,43 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - LOG.debug("no lines for '%s' match or fit", line.id) + self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue - covers 
= np.sum(covers_bg[new_lines,j]) + covers = np.sum(covers_bg[new_lines, j]) if covers < threshold / 3: - LOG.debug("new lines for '%s' only cover %.1f%% bg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue - covers = np.sum(covers_fg[new_lines,j]) + covers = np.sum(covers_fg[new_lines, j]) if covers < threshold: - LOG.debug("new lines for '%s' only cover %.1f%% fg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue - looses = (assignments < 0) & (covers_bg[:,j] > 0.1) + looses = (assignments < 0) & (covers_bg[:, j] > 0.1) if looses.any(): - covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - LOG.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", - line.id, np.count_nonzero(looses), covers * 100) + covers = np.sum(covers_bg[np.nonzero(looses)[0], j]) + self.logger.debug( + f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " + f"totalling %.1f%% bg", covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, line_count, new_count) + self.logger.debug(f'Black pixels before/after resegment of line "{line.id}": {line_count}/{new_count}') # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) - new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] - for i in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") + # intersections[(i, j)] + new_polygon = join_polygons([new_line_polygons[i] for i in new_lines], loc=line.id, 
scale=scale) + new_baseline = join_baselines( + self.logger, [new_polygon.intersection(new_baselines[i]) for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: - line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], - parent_image, parent_coords) + line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) if new_baseline is not None: - new_baseline = coordinates_for_segment(new_baseline.coords, - parent_image, parent_coords) + new_baseline = coordinates_for_segment(new_baseline.coords, parent_image, parent_coords) line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) line_polygons[j] = prep(new_polygon) # now also ensure the assigned lines do not overlap other existing lines @@ -429,26 +374,27 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - LOG.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug(f"subtracting new '{line.id}' from overlapping '{otherline.id}'") other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue # convert back to absolute (page) coordinates: - other_polygon = coordinates_for_segment(other_polygon.exterior.coords[:-1], - parent_image, parent_coords) + other_polygon = coordinates_for_segment( + other_polygon.exterior.coords[:-1], parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", otherline.id) + 
self.logger.warning(f"Ignoring extant new polygon for line '{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + +def spread_dist( + logger: Logger, lines, old_labels, new_labels, binarized, components, coords, maxdist=43, loc='', + threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - LOG = getLogger('processor.OcropyResegment') DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently - # (ignoring smallest components like punctuation) + # (ignoring the smallest components like punctuation) # but when there are conflicts, meet in the middle via watershed new_labels2 = morph.propagate_labels(components > 0, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) @@ -456,7 +402,7 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, # dilate/grow labels from connected components against each other and bg new_labels = morph.spread_labels(new_labels2, maxdist=maxdist) DSAVE('spread', new_labels) - # now propagate again to catch smallest components like punctuation + # now propagate again to catch the smallest components like punctuation new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) @@ -470,41 +416,37 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue count = np.count_nonzero(old_label) if not count: - LOG.warning("skipping zero-area line '%s'", line.id) + logger.warning(f"skipping zero-area line '{line.id}'") continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - 
LOG.debug("new line for '%s' only covers %.1f%% bg", - line.id, covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - LOG.warning("skipping binary-empty line '%s'", line.id) + logger.warning(f"skipping binary-empty line '{line.id}'") continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - LOG.debug("new line for '%s' only covers %.1f%% fg", - line.id, covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, count, covers * count) - contours = [contour[:,::-1] # get x,y order again + logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') + contours = [contour[:, :: -1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - LOG.warning("no contours for %s - keeping", line.id) + logger.warning(f"no contours for {line.id} - keeping") continue else: # get alpha shape - poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours - if len(contour) >= 4], - loc=line.id, scale=maxdist) + poly = join_polygons( + [make_valid(Polygon(contour)) for contour in contours if len(contour) >= 4], + loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - LOG.warning("Ignoring extant line for %s", line.id) + logger.warning(f"Ignoring extant line for {line.id}") continue line.get_Coords().set_points(points_from_polygon(polygon)) @@ -516,9 +458,8 @@ def baseline_of_segment(segment, coords): # zzz should go into core ocrd_utils def polygon_from_baseline(baseline, scale): - ltr = baseline[0,0] < baseline[-1,0] 
+ ltr = baseline[0, 0] < baseline[-1, 0] # left-hand side if left-to-right, and vice versa - polygon = make_valid(join_polygons([LineString(baseline).buffer(scale * (-1) ** ltr, - single_sided=True)], - scale=scale)) + polygon = make_valid(join_polygons( + [LineString(baseline).buffer(scale * (-1) ** ltr, single_sided=True)], scale=scale)) return polygon diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 49cb6776..493deb30 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,7 +1,9 @@ from __future__ import absolute_import -import os.path +from typing import Optional +from logging import Logger import itertools + import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw @@ -13,14 +15,20 @@ from shapely.validation import explain_validity from shapely import set_precision -from ocrd_modelfactory import page_from_file +from ocrd_utils import ( + coordinates_of_segment, + coordinates_for_segment, + points_from_polygon, + polygon_from_points, +) from ocrd_models.ocrd_page import ( - to_xml, CoordsType, + CoordsType, TextLineType, TextRegionType, SeparatorRegionType, PageType, - AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd_models.ocrd_page_generateds import ( BaselineType, @@ -35,32 +43,23 @@ ReadingOrderType ) from ocrd import Processor -from ocrd_utils import ( - getLogger, - make_file_id, - assert_file_grp_cardinality, - coordinates_of_segment, - coordinates_for_segment, - points_from_polygon, - polygon_from_points, - MIMETYPE_PAGE -) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from .. 
import get_ocrd_tool from .ocrolib import midrange from .ocrolib import morph from .common import ( pil2array, array2pil, check_page, check_region, + determine_zoom, hmerge_line_seeds, compute_segmentation, lines2regions ) -TOOL = 'ocrd-cis-ocropy-segment' -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, + reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -77,11 +76,11 @@ def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=N - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. """ - LOG = getLogger('processor.OcropySegment') # find sharp baseline if baselines is not None: def getx(xy): return xy[0] + baselines = [LineString(sorted([p[::-1] for p in line], key=getx)).simplify(5) for line in baselines if len(line) >= 2] @@ -94,17 +93,15 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - LOG.debug('skipping label %d in %s due to empty fg', - label, name) + logger.debug(f'Skipping label {label} in {name} due to empty fg') continue # simplify to convex hull if simplify is not None: hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(bool) - conflicts = np.setdiff1d(hull * simplify, - bg_mask * simplify) + conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - LOG.debug('Cannot simplify %d: convex hull would create additional intersections %s', - label, str(conflicts)) + logger.debug( + f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') else: bg_mask = hull if open_holes: @@ -132,8 +129,8 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] 
continue - LOG.debug("label %d contour %d [%d pts] has hole %d [%d pts]", - label, idx, len(contour), idx_hole, len(hole)) + logger.debug( + f"Label {label} contour {idx} [{len(contour)} pts] has hole {idx_hole} [{len(hole)} pts]") #plot_poly(hole, 'blue') # cut child from outside... # first get nearest point on child @@ -147,10 +144,10 @@ def getx(xy): contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(int)[:,0] // 10) interpol = [] for i, ntics in enumerate(contourtics): - interpol.extend(np.array(contour[i:i+1] + - contour2[i:i+1] * - np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], - int)) + interpol.extend(np.array( + contour[i:i + 1] + + contour2[i:i + 1] * + np.linspace(0, 1, ntics)[:, np.newaxis, np.newaxis], int)) interpol.append(contour[-1]) interpol = np.array(interpol) contourtics = np.insert(np.cumsum(contourtics), 0, 0) @@ -163,27 +160,28 @@ def getx(xy): contour_idx2 = contour_idx if contour_idx2 >= len(contour): contour_idx2 = 0 - cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx+1] + cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx + 1] if interpol_idx == 0: diff1 = (interpol[-1:] - cispoint1) // 5 else: - diff1 = (interpol[interpol_idx-1:interpol_idx] - cispoint1) // 5 + diff1 = (interpol[interpol_idx - 1: interpol_idx] - cispoint1) // 5 if interpol_idx + 1 >= len(interpol): diff2 = (interpol[0:1] - cispoint2) // 5 else: - diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 + diff2 = (interpol[interpol_idx + 1: interpol_idx + 2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - LOG.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) - contour = np.concatenate([contour[:contour_idx], cispoint1, - 
hole[hole_idx:], hole[:hole_idx], - cispoint2, contour[contour_idx:]]) + contour = np.concatenate( + [contour[:contour_idx], cispoint1, + hole[hole_idx:], hole[:hole_idx], + cispoint2, contour[contour_idx:]]) #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - LOG.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug(f"Adding label {label} contour {idx} [{len(contour)} pts]") contours.append(contour) idx = hier[0, idx, 0] else: @@ -209,55 +207,52 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - LOG.warning('Label %d contour %d is too small (%d/%d) in %s', - label, i, area, total_area, name) + logger.warning(f'Label {label} contour {i} is too small ({area}/{total_area}) in {name}') continue # simplify shape: # can produce invalid (self-intersecting) polygons: #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y - polygon = contour[:, 0, ::] # already ordered x,y + polygon = contour[:, 0, ::] # already ordered x,y # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: - #LOG.debug(polygon.wkt) - LOG.debug(explain_validity(polygon)) + #logger.debug(polygon.wkt) + logger.debug(explain_validity(polygon)) polygon = make_valid(polygon) if not polygon.is_valid: #LOG.debug(polygon.wkt) - LOG.warning(explain_validity(polygon)) - poly = polygon.exterior.coords[:-1] # keep open + logger.warning(explain_validity(polygon)) + poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines([baseline.intersection(polygon) - for baseline in baselines - if 
baseline.intersects(polygon)], name) + base = join_baselines( + logger, + [baseline.intersection(polygon) for baseline in baselines if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: base = None results.append((label, poly, base)) - result_labels[contour_labels == i+1] = len(results) + result_labels[contour_labels == i + 1] = len(results) return results, result_labels class OcropySegment(Processor): + @property + def executable(self): + return 'ocrd-cis-ocropy-segment' - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropySegment, self).__init__(*args, **kwargs) - - def process(self): + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - - Open and deserialise PAGE input files and their respective images, + + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + + \b Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. @@ -270,12 +265,13 @@ def process(self): - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + + \b Then compute a text line segmentation for that array (as a label mask). 
When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -284,25 +280,26 @@ def process(self): - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. - + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. - + + \b Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -314,10 +311,9 @@ def process(self): - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) - + Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropySegment') # FIXME: allow passing a-priori info on reading order / textline order # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture # of different scripts; also, vertical writing needs internal rotation @@ -328,222 +324,196 @@ def process(self): overwrite_order = self.parameter['overwrite_order'] oplevel = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() - for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) + # TODO: also allow grayscale_normalized (try/except?) + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - # TODO: also allow grayscale_normalized (try/except?) 
- page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + # aggregate existing regions so their foreground can be ignored + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + if oplevel == 'page' and overwrite_separators: + page.set_SeparatorRegion([]) + else: + ignore.extend(page.get_SeparatorRegion()) + # prepare reading order + reading_order = dict() + ro = page.get_ReadingOrder() + if ro: + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if rogroup: + page_get_reading_order(reading_order, rogroup) - # aggregate existing regions so their foreground can be ignored - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - if oplevel == 'page' and overwrite_separators: - page.set_SeparatorRegion([]) - else: - ignore.extend(page.get_SeparatorRegion()) - # prepare reading order - reading_order = dict() - ro = page.get_ReadingOrder() - if ro: - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if rogroup: - page_get_reading_order(reading_order, rogroup) - - # get segments to process / overwrite - if oplevel == 'page': - 
ignore.extend(page.get_TableRegion()) - regions = list(page.get_TextRegion()) - if regions: - # page is already region-segmented - if overwrite_regions: - LOG.info('removing existing TextRegions in page "%s"', page_id) - # we could remove all other region types as well, - # but this is more flexible (for workflows with - # specialized separator/image/table detectors): - page.set_TextRegion([]) - page.set_ReadingOrder(None) - ro = None - else: - LOG.warning('keeping existing TextRegions in page "%s"', page_id) - ignore.extend(regions) - # create reading order if necessary - if not ro or overwrite_order: - ro = ReadingOrderType() - page.set_ReadingOrder(ro) - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if not rogroup: - # new top-level group - rogroup = OrderedGroupType(id="reading-order") - ro.set_OrderedGroup(rogroup) - # go get TextRegions with TextLines (and SeparatorRegions): - self._process_element(page, ignore, page_image, page_coords, - page_id, file_id, - input_file.pageId, zoom, rogroup=rogroup) - if (not rogroup.get_RegionRefIndexed() and + # get segments to process / overwrite + if oplevel == 'page': + ignore.extend(page.get_TableRegion()) + regions = list(page.get_TextRegion()) + if regions: + # page is already region-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in page "{page_id}"', ) + # we could remove all other region types as well, + # but this is more flexible (for workflows with + # specialized separator/image/table detectors): + page.set_TextRegion([]) + page.set_ReadingOrder(None) + ro = None + else: + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"', ) + ignore.extend(regions) + # create reading order if necessary + if not ro or overwrite_order: + ro = ReadingOrderType() + page.set_ReadingOrder(ro) + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if not rogroup: + # new top-level group + rogroup = OrderedGroupType(id="reading-order") + 
ro.set_OrderedGroup(rogroup) + if (not rogroup.get_RegionRefIndexed() and not rogroup.get_OrderedGroupIndexed() and not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup - ro.set_OrderedGroup(None) - elif oplevel == 'table': - ignore.extend(page.get_TextRegion()) - regions = list(page.get_TableRegion()) - if not regions: - LOG.warning('Page "%s" contains no table regions', page_id) - for region in regions: - subregions = region.get_TextRegion() - if subregions: - # table is already cell-segmented - if overwrite_regions: - LOG.info('removing existing TextRegions in table "%s"', region.id) - region.set_TextRegion([]) - roelem = reading_order.get(region.id) - # replace by empty group with same index and ref - # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem) - else: - LOG.warning('skipping table "%s" with existing TextRegions', region.id) - continue - # TODO: also allow grayscale_normalized (try/except?) 
- region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # ignore everything but the current table region - subignore = regions + ignore - subignore.remove(region) - # create reading order group if necessary - roelem = reading_order.get(region.id) - if not roelem: - LOG.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") - elif overwrite_order: - # replace by empty ordered group with same (index and) ref + # schema forbids empty OrderedGroup + ro.set_OrderedGroup(None) + # go get TextRegions with TextLines (and SeparatorRegions): + image = self._process_element(page, ignore, page_image, page_coords, zoom=zoom, rogroup=rogroup) + if image: + result.images.append(image) + return result + + if oplevel == 'table': + ignore.extend(page.get_TextRegion()) + regions = list(page.get_TableRegion()) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no table regions') + for region in regions: + subregions = region.get_TextRegion() + if subregions: + # table is already cell-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + region.set_TextRegion([]) + roelem = reading_order.get(region.id) + # replace by empty group with same index and ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) - reading_order[region.id] = roelem - elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") - elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") - roelem = None + reading_order[region.id] = 
page_subgroup_in_reading_order(self.logger, roelem) else: - # replace regionRef(Indexed) by group with same index and ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) - reading_order[region.id] = roelem - # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element(region, subignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom, rogroup=roelem) - else: # 'region' - regions = list(page.get_TextRegion()) - # besides top-level text regions, line-segment any table cells, - # and for tables without any cells, add a pseudo-cell - for region in page.get_TableRegion(): - subregions = region.get_TextRegion() - if subregions: - regions.extend(subregions) + self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + continue + # TODO: also allow grayscale_normalized (try/except?) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # ignore everything but the current table region + subignore = regions + ignore + subignore.remove(region) + # create reading order group if necessary + roelem = reading_order.get(region.id) + if not roelem: + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' is not referenced in reading order " + f"(no target to add cells to)") + elif overwrite_order: + # replace by empty ordered group with same (index and) ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an ordered group " + f"(cells will be appended)") + elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' 
already has an unordered group " + f"(cells will not be appended)") + roelem = None + else: + # replace regionRef(Indexed) by group with same index and ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + # go get TextRegions with TextLines (and SeparatorRegions) + image = self._process_element( + region, subignore, region_image, region_coords, zoom=zoom, rogroup=roelem) + if image: + result.images.append(image) + else: # 'region' + regions = list(page.get_TextRegion()) + # besides top-level text regions, line-segment any table cells, + # and for tables without any cells, add a pseudo-cell + for region in page.get_TableRegion(): + subregions = region.get_TextRegion() + if subregions: + regions.extend(subregions) + else: + subregion = TextRegionType( + id=f'{region.id}_text', Coords=region.get_Coords(), parent_object_=region) + region.add_TextRegion(subregion) + regions.append(subregion) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no text regions') + for region in regions: + if region.get_TextLine(): + if overwrite_lines: + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + region.set_TextLine([]) else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) - region.add_TextRegion(subregion) - regions.append(subregion) - if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) - for region in regions: - if region.get_TextLine(): - if overwrite_lines: - LOG.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) - region.set_TextLine([]) - else: - LOG.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) - ignore.extend(region.get_TextLine()) - # TODO: also allow grayscale_normalized (try/except?) 
- region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # if the region images have already been clipped against their neighbours specifically, - # then we don't need to suppress all neighbours' foreground generally here - if 'clipped' in region_coords['features'].split(','): - ignore = [] - # go get TextLines - self._process_element(region, ignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom) - - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + ignore.extend(region.get_TextLine()) + # TODO: also allow grayscale_normalized (try/except?) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # if the region images have already been clipped against their neighbours specifically, + # then we don't need to suppress all neighbours' foreground generally here + if 'clipped' in region_coords['features'].split(','): + ignore = [] + # go get TextLines + image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) + if image: + result.images.append(image) + return result - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): + def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: """Add PAGE layout elements by segmenting an image. 
Given a PageType, TableRegionType or TextRegionType ``element``, and a corresponding binarized PIL.Image object ``image`` with coordinate metadata ``coords``, run line segmentation with Ocropy. - + If operating on the full page (or table), then also detect horizontal and vertical separators, and aggregate the lines into text regions afterwards. - + Add the resulting sub-segments to the parent ``element``. - + If ``ignore`` is not empty, then first suppress all foreground components in any of those segments' coordinates during segmentation, and if also in full page/table mode, then combine all separators among them with the newly detected separators to guide region segmentation. """ - LOG = getLogger('processor.OcropySegment') if not image.width or not image.height: - LOG.warning("Skipping '%s' with zero size", element_id) - return + self.logger.warning(f"Skipping '{element.id}' with zero size") + return None element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - LOG.debug('masking foreground of %s "%s" for "%s"', - type(segment).__name__[:-4], segment.id, element_id) + self.logger.debug( + f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element.id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -554,17 +524,16 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does # not need to concern herself with this. 
+ sp_row = segment_polygon[:, 1] + sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - sep_bin.shape)] = True - ignore_labels[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - ignore_labels.shape)] = i+1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True report = check_page(element_bin, zoom) + suffix = '.IMG-CLIP' elif isinstance(element, TableRegionType) or ( # sole/congruent text region of a table region? element.id.endswith('_text') and @@ -572,11 +541,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'table' fullpage = True report = check_region(element_bin, zoom) + suffix = f"{element.id}.IMG-CLIP" else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - LOG.info('computing line segmentation for %s "%s"', element_name, element_id) + suffix = f"{element.id}.IMG-CLIP" + element_name_id = f'{element_name} "{element.id}"' + self.logger.info(f'Computing line segmentation for {element_name_id}') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -584,9 +556,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_labels, baselines, seplines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): - element_bin, seps=(sep_bin+ignore_labels)>0, + element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread']/zoom*300/72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage 
mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -594,16 +566,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - LOG.error('Cannot line-segment region "%s": %s', element_id, err) + self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=f"{element.id}_line", Coords=element.get_Coords())) else: - LOG.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) - return + self.logger.error(f'Cannot line-segment {element_name_id}: {err}') + return None - LOG.info('Found %d text lines for %s "%s"', - len(np.unique(line_labels)) - 1, - element_name, element_id) + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name_id}') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -612,31 +582,29 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # i.e. 
identical line and region labels # to detect their reading order among the others # (these cannot be split or grouped together with other regions) - line_labels = np.where(line_labels, line_labels+len(ignore), ignore_labels) + line_labels = np.where(line_labels, line_labels + len(ignore), ignore_labels) # suppress separators/images in fg and try to use for partitioning slices sepmask = np.maximum(sep_bin, np.maximum(seplines > 0, images > 0)) region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - LOG.info('Found %d text regions for %s "%s"', - len(np.unique(region_labels)) - 1, - element_name, element_id) + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name_id}') except Exception as err: - LOG.error('Cannot region-segment %s "%s": %s', - element_name, element_id, err) + self.logger.error(f'Cannot region-segment {element_name_id}: {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): index = 0 - # start counting from largest existing index + # start counting from the largest existing index for elem in (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + rogroup.get_UnorderedGroupIndexed()): @@ -648,7 +616,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask 
region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -657,13 +625,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - "region label %d has both existing regions and new lines (%s)" % ( - region_label, str(region_line_labels0)) + (f'Region label "{region_label}" has both existing regions and new lines ' + f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - LOG.debug('Region label %d is for ignored region "%s"', - region_label, region.id) + self.logger.debug(f'Region label "{region_label}" is for ignored region "{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -671,18 +638,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(region_mask * region_label, None, element_bin, - '%s "%s"' % (element_name, element_id), - min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + self.logger, region_mask * region_label, None, element_bin, name=element_name_id, + min_area=6000 / zoom / zoom, simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be 
non-contiguous) - lines, _ = masks2polygons(region_line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -691,34 +657,31 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning(f'Ignoring extant region contour for region label {region_label}') continue # annotate result: region_no += 1 - region_id = element_id + "_region%04d" % region_no - LOG.debug('Region label %d becomes ID "%s"', region_label, region_id) - region = TextRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon))) + region_id = f"{element.id}_region%04d" % region_no + self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') + region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - LOG.warning('Ignoring extant line contour for region label %d line label %d', - region_label, line_label) + 
self.logger.warning( + f'Ignoring extant line contour for region label {region_label} line label {line_label}') continue # annotate result: line_no += 1 - line_id = region_id + "_line%04d" % line_no - LOG.debug('Line label %d becomes ID "%s"', line_label, line_id) - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + line_id = f"{region_id}_line%04d" % line_no + self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: line_baseline = coordinates_for_segment(line_baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) @@ -726,102 +689,88 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - LOG.info('Added region "%s" with %d lines for %s "%s"', - region_id, line_no, element_name, element_id) + self.logger.info(f'Added region "{region_id}" with {line_no} lines for {element_name_id}') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... 
- LOG.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info(f'Found {images.max()} large image regions for {element_name_id}') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, name=element_name_id) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning(f'Ignoring extant region contour for image label {image_label}') continue region_no += 1 # annotate result: - region_id = element_id + "_image%04d" % region_no + region_id = f"{element.id}_image%04d" % region_no element.add_ImageRegion(ImageRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - LOG.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info(f'Found {seplines.max()} separators for {element_name_id}') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(seplines, None, element_bin, - '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, name=element_name_id, open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if 
region_polygon is None: - LOG.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning(f'Ignoring extant region contour for separator {sep_label}') continue # annotate result: region_no += 1 - region_id = element_id + "_sep%04d" % region_no + region_id = f"{element.id}_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image - element_array[sepmask] = np.amax(element_array) # clip to white/bg + element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) else: # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, 
polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - LOG.warning('Ignoring extant line contour for line label %d', - line_label) + self.logger.warning(f'Ignoring extant line contour for line label {line_label}') continue # annotate result: line_no += 1 - line_id = element_id + "_line%04d" % line_no - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + line_id = f"{element.id}_line%04d" % line_no + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) - # update PAGE (reference the image file): - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. - + (Should be moved to ocrd_utils.coordinates_for_segment.) 
""" childp = Polygon(polygon) @@ -915,11 +864,12 @@ def join_polygons(polygons, loc='', scale=20): dists[j, i] = dist dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) + max_dist = max(1.0, scale / 5) for prevp, nextp in zip(*dists.nonzero()): prevp = polygons[prevp] nextp = polygons[nextp] nearest = nearest_points(prevp, nextp) - bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + bridgep = LineString(nearest).buffer(max_dist, resolution=1) polygons.append(bridgep) jointp = unary_union(polygons) assert jointp.geom_type == 'Polygon', jointp.wkt @@ -930,8 +880,7 @@ def join_polygons(polygons, loc='', scale=20): jointp = make_valid(jointp) return jointp -def join_baselines(baselines, loc=''): - LOG = getLogger('processor.OcropyResegment') +def join_baselines(logger: Logger, baselines, loc=''): lines = [] for baseline in baselines: if (baseline.is_empty or @@ -948,9 +897,9 @@ def join_baselines(baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - LOG.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning(f"Ignoring baseline subtype {geom.geom_type} in {loc}") else: - LOG.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning(f"Ignoring baseline type {baseline.geom_type} in {loc}") nlines = len(lines) if nlines == 0: return None @@ -1012,7 +961,7 @@ def join_baselines(baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - LOG.warning("baseline merge impossible (no spanning tree) in %s", loc) + logger.warning(f"Baseline merge impossible (no spanning tree) in {loc}") return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -1024,7 +973,7 @@ def join_baselines(baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - LOG.warning("baseline merge is empty in %s", loc) + logger.warning(f"Baseline merge is empty in 
{loc}") return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) @@ -1034,7 +983,7 @@ def join_baselines(baselines, loc=''): def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. - + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. @@ -1054,10 +1003,10 @@ def page_get_reading_order(ro, rogroup): def page_add_to_reading_order(rogroup, region_id, index=None): """Add a region reference to an un/ordered RO group. - + Given a ReadingOrder group ``rogroup`` (of any type), append a reference to region ``region_id`` to it. - + If ``index`` is given, use that as position and return incremented by one. (This must be an integer if ``rogroup`` is an OrderedGroup(Indexed). @@ -1065,65 +1014,56 @@ def page_add_to_reading_order(rogroup, region_id, index=None): """ if rogroup: if index is None: - rogroup.add_RegionRef(RegionRefType( - regionRef=region_id)) + rogroup.add_RegionRef(RegionRefType(regionRef=region_id)) else: - rogroup.add_RegionRefIndexed(RegionRefIndexedType( - regionRef=region_id, index=index)) + rogroup.add_RegionRefIndexed(RegionRefIndexedType(regionRef=region_id, index=index)) index += 1 return index -def page_subgroup_in_reading_order(roelem): +def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. - + Given a ReadingOrder element ``roelem`` (of any type), first look up its parent group. Remove it from the respective member list (of its region refs or un/ordered groups), even if it already was an OrderedGroup(Indexed). - + Then instantiate an empty OrderedGroup(Indexed), referencing the same region as ``roelem`` (and using the same index, if any). Add that group to the parent instead. - + Return the new group object. 
""" - LOG = getLogger('processor.OcropySegment') if not roelem: - LOG.error('Cannot subgroup from empty ReadingOrder element') + logger.error('Cannot subgroup from empty ReadingOrder element') return roelem if not roelem.parent_object_: - LOG.error('Cannot subgroup from orphan ReadingOrder element') + logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem - if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( + if isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or roelem.get_UnorderedGroupIndexed() or roelem.get_RegionRefIndexed()): # is already a group and still empty return roelem - if isinstance(roelem, (OrderedGroupType, - UnorderedGroupType, - RegionRefType)): + if isinstance(roelem, (OrderedGroupType, UnorderedGroupType, RegionRefType)): getattr(roelem.parent_object_, { OrderedGroupType: 'get_OrderedGroup', UnorderedGroupType: 'get_UnorderedGroup', RegionRefType: 'get_RegionRef', }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupType(id=roelem.regionRef + '_group', - regionRef=roelem.regionRef) + roelem2 = OrderedGroupType(id=f"{roelem.regionRef}_group", regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroup(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 - if isinstance(roelem, (OrderedGroupIndexedType, - UnorderedGroupIndexedType, - RegionRefIndexedType)): + if isinstance(roelem, (OrderedGroupIndexedType, UnorderedGroupIndexedType, RegionRefIndexedType)): getattr(roelem.parent_object_, { OrderedGroupIndexedType: 'get_OrderedGroupIndexed', UnorderedGroupIndexedType: 'get_UnorderedGroupIndexed', RegionRefIndexedType: 'get_RegionRefIndexed' }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupIndexedType(id=roelem.regionRef + '_group', - index=roelem.index, - regionRef=roelem.regionRef) + roelem2 = OrderedGroupIndexedType( + id=f"{roelem.regionRef}_group", index=roelem.index, 
regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroupIndexed(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index d257a61f..78302f12 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,13 +1,13 @@ from __future__ import absolute_import -import sys -import os -import tempfile +from typing import Optional +from logging import Logger +from sys import exit +from os import makedirs, remove +from os.path import abspath, dirname, exists, join, isfile -from ocrd_modelfactory import page_from_file -from ocrd import Processor -from ocrd_utils import getLogger -from ocrd_cis import get_ocrd_tool +from ocrd_models import OcrdPage +from ocrd import Processor, Workspace, OcrdPageResult from .ocropus_rtrain import * from .binarize import binarize @@ -15,10 +15,10 @@ def deletefiles(filelist): for file in filelist: - if os.path.exists(file): - os.remove(file) - if os.path.exists(file[:-3]+'gt.txt'): - os.remove(file[:-3]+'gt.txt') + if exists(file): + remove(file) + if exists(file[:-3] + 'gt.txt'): + remove(file[:-3] + 'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) @@ -28,92 +28,87 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): + modelpath: str + outputpath: str - def __init__(self, *args, **kwargs): - self.oldcwd = os.getcwd() - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-ocropy-train'] - kwargs['version'] = ocrd_tool['version'] - super(OcropyTrain, self).__init__(*args, **kwargs) - if hasattr(self, 'input_file_grp'): - # processing context - self.setup() + @property + def executable(self): + return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') - #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] try: - modelpath = self.resolve_resource(model) + self.modelpath 
= self.resolve_resource(model) except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - modelpath = os.path.join(ocropydir, 'models', model) - self.log.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) - if not os.path.isfile(modelpath): - self.log.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", - model, model) - sys.exit(1) - outputpath = os.path.join(self.oldcwd, 'output', model) - if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, model) + ocropydir = dirname(abspath(__file__)) + self.modelpath = join(ocropydir, 'models', model) + self.logger.error(f"Failed to resolve model '{model}' path, trying '{self.modelpath}'") + if not isfile(self.modelpath): + self.logger.critical(f"Could not find model '{model}'.\n" + f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") + exit(1) + self.outputpath = join(self.parameter.get('outputpath', 'output'), model) else: - modelpath = None - outputpath = os.path.join(self.oldcwd, 'output', 'lstm') - if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, 'lstm') - os.makedirs(os.path.dirname(outputpath)) - self.modelpath = modelpath - self.outputpath = outputpath - - def process(self): + self.modelpath = None + self.outputpath = join(self.parameter.get('outputpath', 'output'), 'lstm') + makedirs(dirname(self.outputpath)) + self.filelist = None + + def process_workspace(self, workspace: Workspace) -> None: """ Trains a new model on the text lines from the input fileGrp, - extracted as temporary image-text file pairs. + extracted as image-text file pairs into the output fileGrp. + (If the output fileGrp already exists and these files should + be re-used, pass the `--overwrite` option when processing.) + + The model is written into `outputpath` (or just `output`) under + the same name as `model` (i.e. the start model, or just `lstm`). 
+ """ + self.filelist = [] + super().process_workspace(workspace) + self.logger.info(f"Training {self.outputpath} from {self.modelpath or 'scratch'} " + f"on {len(self.filelist)} file pairs") + rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) + # deletefiles(self.filelist) + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ - filelist = [] - filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') - #self.log.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - #self.log.info("INPUT FILE %i / %s", n, input_file) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - - self.log.info("Extracting from page '%s'", page_id) - for region in page.get_AllRegions(classes=['Text']): - textlines = region.get_TextLine() - self.log.info("Extracting %i lines from region '%s'", len(textlines), region.id) - for line in textlines: - if self.parameter['textequiv_level'] == 'line': - path = os.path.join(filepath, page_id + region.id + line.id) - imgpath = self.extract_segment(path, line, page_image, page_coords) - if imgpath: - filelist.append(imgpath) + Extracts pairs of plaintext and cropped image files for each text line + in the PAGE file (to be used during training). 
+ """ + pcgts = input_pcgts[0] + #self.logger.info("Using model %s in %s for recognition", model) + page = pcgts.get_Page() + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + + self.logger.debug(f"Extracting from page '{page_id}'") + for region in page.get_AllRegions(classes=['Text']): + textlines = region.get_TextLine() + self.logger.debug(f"Extracting {len(textlines)} lines from region '{region.id}'") + for line in textlines: + if self.parameter['textequiv_level'] == 'line': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}") + self.filelist.append(self.extract_segment(path, line, page_image, page_coords)) + continue + for word in line.get_Word(): + if self.parameter['textequiv_level'] == 'word': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}") + self.filelist.append(self.extract_segment(path, word, page_image, page_coords)) continue - for word in line.get_Word(): - if self.parameter['textequiv_level'] == 'word': - path = os.path.join(filepath, page_id + region.id + line.id + word.id) - imgpath = self.extract_segment(path, word, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - continue - for glyph in word.get_Glyph(): - path = os.path.join(filepath, page_id + region.id + line.id + glyph.id) - imgpath = self.extract_segment(path, glyph, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - - self.log.info("Training %s from %s on %i file pairs", - self.outputpath, - self.modelpath or 'scratch', - len(filelist)) - rtrain(filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) - deletefiles(filelist) + for glyph in word.get_Glyph(): + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}_{glyph.id}") + self.filelist.append(self.extract_segment(path, glyph, page_image, page_coords)) + # FIXME: PAGE-XML not really needed, find a way around this (raising special exception?) 
+ return OcrdPageResult(pcgts) def extract_segment(self, path, segment, page_image, page_coords): - #ground truth + gtpath = path + '.gt.txt' + imgpath = path + '.png' + if exists(gtpath) and exists(imgpath): + self.logger.debug(f"Reusing {segment.__class__.__name__} '{segment.id}' file pair") + return imgpath + gt = segment.TextEquiv if not gt: return None @@ -121,22 +116,19 @@ def extract_segment(self, path, segment, page_image, page_coords): if not gt or not gt.strip(): return None gt = gt.strip() - gtpath = path + '.gt.txt' with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.log.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug(f"Extracting {segment.__class__.__name__} '{segment.id}' file pair") image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): # binarize with nlbin - image, _ = binarize(image, maxskew=0) + image, _ = binarize(self.logger, image, maxskew=0) # resize image to 48 pixel height image = resize_keep_ratio(image) - #save temp image - imgpath = path + '.png' image.save(imgpath) return imgpath diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index dc3ee48e..70918de7 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,14 +1,16 @@ from __future__ import absolute_import -import click -import json import os -from ocrd import Processor -from ocrd.decorators import ocrd_cli_options -from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import getLogger, getLevelName -from ocrd_models.ocrd_mets import OcrdMets +import json + +import click + +from ocrd import Processor, Workspace +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor +from ocrd_utils import getLevelName, pushd_popd +from ocrd_models import OcrdMets + from ocrd_cis import JavaPostCorrector -from ocrd_cis import get_ocrd_tool + @click.command() @ocrd_cli_options @@ -16,33 +18,55 
@@ def ocrd_cis_postcorrect(*args, **kwargs): return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs) class PostCorrector(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect'] - kwargs['version'] = ocrd_tool['version'] - super(PostCorrector, self).__init__(*args, **kwargs) - - def process(self): - self.log = getLogger('processor.CISPostCorrector') + @property + def executable(self): + return 'ocrd-cis-postcorrect' + + def setup(self): + # since ocrd v3.0 we cannot overwrite self.parameter anymore + # because that gets validated against the schema + # (so these additions would fail) + self.params = dict(self.parameter) profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True - self.parameter["profiler"] = profiler - self.parameter["runDM"] = True - self.log.debug(json.dumps(self.parameter, indent=4)) - p = JavaPostCorrector(self.workspace.mets_target, - self.input_file_grp, - self.output_file_grp, - self.parameter, - getLevelName(self.log.getEffectiveLevel())) - p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() - # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): - flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') - flocat.attrib['LOCTYPE'] = 'OTHER' - flocat.attrib['OTHERLOCTYPE'] = 'FILE' - output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + self.params["profiler"] = profiler + self.params["runDM"] = True + self.logger.debug(json.dumps(self.params, indent=4)) + + def process_workspace(self, workspace: Workspace): + with pushd_popd(workspace.directory): + self.workspace = workspace + self.verify() + # 
ensure that input files are referenced in on-disk METS + self.workspace.save_mets() + # this CLI call mimics the OCR-D processor CLI itself + # we have no control over its interior + # (we get no page-wise error handling and input downloading) + p = JavaPostCorrector(self.workspace.mets_target, + self.input_file_grp, + self.output_file_grp, + self.params, + getLevelName(self.logger.getEffectiveLevel())) + p.exe() + # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): + # We cannot do that with this method, because our self.workspace.mets might be + # a ClientSideOcrdMets, which does not allow modifying or removing files: + # for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + # flocat.attrib['LOCTYPE'] = 'OTHER' + # flocat.attrib['OTHERLOCTYPE'] = 'FILE' + # output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + # So instead, let's post-process the local METS file result directly: + mets = OcrdMets(filename=self.workspace.mets_target) + for output_file in mets.find_files(fileGrp=self.output_file_grp): + flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + flocat.attrib['LOCTYPE'] = 'OTHER' + flocat.attrib['OTHERLOCTYPE'] = 'FILE' + output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + with open(self.workspace.mets_target, 'w') as f: + f.write(mets.to_xml(xmllint=True).decode('utf-8')) + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() diff --git a/setup.py b/setup.py index 6df9445c..e8ea1cf3 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.47', + 'ocrd>=3.0.0b4', 'click', 'scipy', 'numpy>=1.17.0', diff --git a/tests/run_add_zip_test.bash 
b/tests/run_add_zip_test.bash index 02de2db2..e2d44983 100644 --- a/tests/run_add_zip_test.bash +++ b/tests/run_add_zip_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-GT-SEG-LINE); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,9 +16,10 @@ popd # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-IMG); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-IMG); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_alignment_test.bash b/tests/run_alignment_test.bash index e8a3c79a..7a82254b 100644 --- a/tests/run_alignment_test.bash +++ b/tests/run_alignment_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -17,9 +17,10 @@ ocrd_cis_align pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-ALIGN); do [[ -f "$file" ]] || fail "cannot find aligned file group workspace" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_image_preprocessing_test.bash 
index f80fc636..7a66a57b 100644 --- a/tests/run_image_preprocessing_test.bash +++ b/tests/run_image_preprocessing_test.bash @@ -7,16 +7,17 @@ ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-clip -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP -ocrd-cis-ocropy-denoise -l DEBUG -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN -ocrd-cis-ocropy-deskew -l DEBUG -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES -ocrd-cis-ocropy-dewarp -l DEBUG -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW -ocrd-cis-ocropy-segment -l DEBUG -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-clip ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP +ocrd-cis-ocropy-denoise ${ARGS[*]} -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN +ocrd-cis-ocropy-deskew ${ARGS[*]} -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES +ocrd-cis-ocropy-dewarp ${ARGS[*]} -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW +ocrd-cis-ocropy-segment ${ARGS[*]} -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG popd diff --git a/tests/run_ocr_test.bash b/tests/run_ocr_test.bash index b10f6f6d..f737ae43 100644 --- a/tests/run_ocr_test.bash +++ b/tests/run_ocr_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G 
$OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,8 +16,9 @@ done ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz # run ocr -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-recognize -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ -P textequiv_level word -P model fraktur.pyrnn.gz popd diff --git a/tests/run_postcorrection_test.bash b/tests/run_postcorrection_test.bash index d7f34ace..859c8407 100644 --- a/tests/run_postcorrection_test.bash +++ b/tests/run_postcorrection_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,25 +15,26 @@ popd ocrd_cis_align -mkdir "$tmpdir/bin" -cat > "$tmpdir/bin/profiler.bash" < "bin/profiler.bash" < /dev/null echo '{}' EOF -chmod a+x "$tmpdir/bin/profiler.bash" -ocrd-cis-postcorrect -l DEBUG \ +chmod a+x "bin/profiler.bash" + +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-postcorrect ${ARGS[*]} \ -I OCR-D-CIS-ALIGN \ -O OCR-D-CIS-POSTCORRECT \ - -m $tmpws/mets.xml \ - -P profilerPath $tmpdir/bin/profiler.bash \ + -P profilerPath bin/profiler.bash \ -P profilerConfig ignored \ -P model "$(ocrd-cis-data -model)" \ -P nOCR 2 -pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-POSTCORRECT); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-POSTCORRECT); do [[ -f "$file" ]] 
|| fail "$file: not a file" found_files=$((found_files + 1)) done diff --git a/tests/run_training_test.bash b/tests/run_training_test.bash index ade1b68e..5b96dc3e 100644 --- a/tests/run_training_test.bash +++ b/tests/run_training_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,9 +15,12 @@ popd ocrd_cis_align +stopserver +OCRD_MAX_PARALLEL_PAGES=1 + # fix ocr for some entries (otherwise the training will fail) pushd $tmpws -for f in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for f in $(ocrd ${OCRD_LOG_ARGS[*]} workspace find -G OCR-D-CIS-ALIGN); do sed -i -e 's#e.#Säugethiere.#' $f sed -i -e 's#E#Säugethieren#' $f done diff --git a/tests/test_lib.bash b/tests/test_lib.bash index f28acb1e..76111d25 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -1,11 +1,28 @@ #/bin/bash tmpdir=$(mktemp -d) -trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR -trap "rm -rf $tmpdir" EXIT +function stopserver() { + : +} +function failexit() { + stopserver +} +function cleanexit() { + stopserver + rm -rf $tmpdir +} +trap "trap failexit EXIT" ERR +trap cleanexit EXIT + +OCRD_LOG_ARGS=() +if test -v OCRD_OVERRIDE_LOGLEVEL; then + OCRD_LOG_ARGS+=(-l $OCRD_OVERRIDE_LOGLEVEL) +fi +OCRD_WS_ARGS=() # -m mets.xml OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" + +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" @@ -16,22 +33,32 @@ function ocrd_cis_init_ws() { ocrd_cis_download_bagit "$1" ocrd zip spill -d "$tmpdir" 
"$PWD/download/$1" tmpws="$tmpdir/${1%.ocrd.zip}" + if ((${OCRD_MAX_PARALLEL_PAGES:-0} > 1)); then + echo starting METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server start & + OCRD_WS_ARGS+=(-U "$tmpws/mets.sock") + sleep 1 + function stopserver() { + echo stopping METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server stop || true + } + fi } + function ocrd_cis_align() { # download ocr models ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz ocrd resmgr download ocrd-cis-ocropy-recognize fraktur-jze.pyrnn.gz # run ocr pushd $tmpws - ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ + ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) + ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ -P textequiv_level word -P model fraktur.pyrnn.gz - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ -P textequiv_level word -P model fraktur-jze.pyrnn.gz - ocrd-cis-align -l DEBUG -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ + ocrd-cis-align ${ARGS[*]} -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ -O OCR-D-CIS-ALIGN popd }