From 8867648fd64d209972fa189b57721dc5b484586d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Jun 2021 19:04:56 +0200 Subject: [PATCH 1/5] TextBlock has no @TYPE --- ocrd_page_to_alto/styles.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_page_to_alto/styles.py b/ocrd_page_to_alto/styles.py index 0645b8b..2e0db62 100644 --- a/ocrd_page_to_alto/styles.py +++ b/ocrd_page_to_alto/styles.py @@ -122,6 +122,7 @@ def __init__(self): def set_alto_tag_from_type(self, reg_alto, reg_page): typ = reg_page.get_type() if hasattr(reg_page, 'get_type') else None if typ: - reg_alto.set('TYPE', typ) + if hasattr(reg_alto, 'TYPE'): + reg_alto.set('TYPE', typ) reg_alto.set('TAGREFS', self.get_id(label=typ)) From a386fa7dcea6645e2615a87b8e6234ea4bc210ad Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Jun 2021 19:05:50 +0200 Subject: [PATCH 2/5] get text regions with arbitrary recursion depth --- ocrd_page_to_alto/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py index 64245a7..5dc93e0 100644 --- a/ocrd_page_to_alto/convert.py +++ b/ocrd_page_to_alto/convert.py @@ -283,7 +283,7 @@ def _convert_table(self, parent_alto, parent_page, level=0): self._convert_textlines(textblock_alto, parent_page) def convert_text(self): - for reg_page in self.page_page.get_AllRegions(depth=1): + for reg_page in self.page_page.get_AllRegions(depth=0): reg_page_type = reg_page.__class__.__name__[0:-10] # len('RegionType') == 10 reg_alto_type = REGION_PAGE_TO_ALTO[reg_page_type] if not reg_alto_type: From 6b9bf28e1792a6bbc987e3a30e461649eae3a3c5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Jun 2021 19:09:39 +0200 Subject: [PATCH 3/5] add loglevel option --- ocrd_page_to_alto/cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd_page_to_alto/cli.py b/ocrd_page_to_alto/cli.py index 9c796a9..4e9f5b2 100644 --- a/ocrd_page_to_alto/cli.py +++ b/ocrd_page_to_alto/cli.py @@ -1,8 +1,10 @@ import click from .convert import OcrdPageAltoConverter from ocrd_utils import initLogging +from ocrd.decorators import ocrd_loglevel @click.command() +@ocrd_loglevel @click.option('--check-words/--no-check-words', default=True, help='Check whether PAGE-XML contains any Words and fail if not') @click.option('--check-border/--no-check-border', default=True, help='Check whether PAGE-XML contains Border or PrintSpace') @click.option('--skip-empty-lines/--no-skip-empty-lines', default=False, help='Whether to omit or keep empty lines in PAGE-XML') @@ -12,7 +14,7 @@ @click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index') @click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element") @click.argument('filename') -def main(check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, filename): +def main(log_level, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, filename): """ Convert PAGE to ALTO """ From db5a9b8d2c87ee58ea000ebe321f7f0c80b25aeb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Jun 2021 22:35:42 +0200 Subject: [PATCH 4/5] add output-file option --- ocrd_page_to_alto/cli.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ocrd_page_to_alto/cli.py b/ocrd_page_to_alto/cli.py index 4e9f5b2..25076ef 100644 --- a/ocrd_page_to_alto/cli.py +++ b/ocrd_page_to_alto/cli.py @@ -13,8 +13,10 @@ @click.option('--dummy-word/--no-dummy-word', default=True, help='Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word') @click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index') @click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element") -@click.argument('filename') -def main(log_level, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, filename): +@click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output, the default)', + type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True)) +@click.argument('filename', type=click.Path(dir_okay=False, exists=True)) +def main(log_level, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, output_file, filename): """ Convert PAGE to ALTO """ @@ -31,7 +33,8 @@ def main(log_level, check_words, check_border, skip_empty_lines, trailing_dash_t textequiv_fallback_strategy=textequiv_fallback_strategy ) converter.convert() - print(converter) + with open(1 if output_file == '-' else output_file, 'w') as output: + output.write(str(converter)) if __name__ == '__main__': main() # pylint: disable=no-value-for-parameter From 73a08b9a64c851108ac38e9a6378c841924cfd26 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 14 Jun 2021 22:36:43 +0200 Subject: [PATCH 5/5] add -h shorthand for --help --- ocrd_page_to_alto/cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd_page_to_alto/cli.py b/ocrd_page_to_alto/cli.py index 25076ef..abd0db0 100644 --- a/ocrd_page_to_alto/cli.py +++ b/ocrd_page_to_alto/cli.py @@ -3,7 +3,9 @@ from ocrd_utils import initLogging from ocrd.decorators import ocrd_loglevel -@click.command() +CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) + +@click.command(context_settings=CONTEXT_SETTINGS) @ocrd_loglevel @click.option('--check-words/--no-check-words', default=True, help='Check whether PAGE-XML contains any Words and fail if not') @click.option('--check-border/--no-check-border', default=True, help='Check whether PAGE-XML contains Border or PrintSpace')