diff --git a/ocrd_page_to_alto/cli.py b/ocrd_page_to_alto/cli.py index 9c796a9..abd0db0 100644 --- a/ocrd_page_to_alto/cli.py +++ b/ocrd_page_to_alto/cli.py @@ -1,8 +1,12 @@ import click from .convert import OcrdPageAltoConverter from ocrd_utils import initLogging +from ocrd.decorators import ocrd_loglevel -@click.command() +CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) + +@click.command(context_settings=CONTEXT_SETTINGS) +@ocrd_loglevel @click.option('--check-words/--no-check-words', default=True, help='Check whether PAGE-XML contains any Words and fail if not') @click.option('--check-border/--no-check-border', default=True, help='Check whether PAGE-XML contains Border or PrintSpace') @click.option('--skip-empty-lines/--no-skip-empty-lines', default=False, help='Whether to omit or keep empty lines in PAGE-XML') @@ -11,8 +15,10 @@ @click.option('--dummy-word/--no-dummy-word', default=True, help='Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word') @click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index') @click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element") -@click.argument('filename') -def main(check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, filename): +@click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output, the default)', + type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True)) +@click.argument('filename', type=click.Path(dir_okay=False, exists=True)) +def main(log_level, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, output_file, filename): """ Convert PAGE to ALTO """ @@ -29,7 +35,8 @@ def main(check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dumm textequiv_fallback_strategy=textequiv_fallback_strategy ) converter.convert() - print(converter) + with open(1 if output_file == '-' else output_file, 'w') as output: + output.write(str(converter)) if __name__ == '__main__': main() # pylint: disable=no-value-for-parameter diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py index 64245a7..5dc93e0 100644 --- a/ocrd_page_to_alto/convert.py +++ b/ocrd_page_to_alto/convert.py @@ -283,7 +283,7 @@ def _convert_table(self, parent_alto, parent_page, level=0): self._convert_textlines(textblock_alto, parent_page) def convert_text(self): - for reg_page in self.page_page.get_AllRegions(depth=1): + for reg_page in self.page_page.get_AllRegions(depth=0): reg_page_type = reg_page.__class__.__name__[0:-10] # len('RegionType') == 10 reg_alto_type = REGION_PAGE_TO_ALTO[reg_page_type] if not reg_alto_type: diff --git a/ocrd_page_to_alto/styles.py b/ocrd_page_to_alto/styles.py index 0645b8b..2e0db62 100644 --- a/ocrd_page_to_alto/styles.py +++ b/ocrd_page_to_alto/styles.py @@ -122,6 +122,7 @@ def __init__(self): def set_alto_tag_from_type(self, reg_alto, reg_page): typ = reg_page.get_type() if hasattr(reg_page, 'get_type') else None if typ: - reg_alto.set('TYPE', typ) + if hasattr(reg_alto, 'TYPE'): + reg_alto.set('TYPE', typ) reg_alto.set('TAGREFS', self.get_id(label=typ))