Skip to content

Commit

Permalink
Merge pull request #13 from bertsky/fixes
Browse files (browse the repository at this point in the history)
Various improvements
  • Loading branch information
kba authored Jun 15, 2021
2 parents 4f1e93d + 73a08b9 commit 92b1657
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 6 deletions.
15 changes: 11 additions & 4 deletions ocrd_page_to_alto/cli.py
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,12 @@
import click
from .convert import OcrdPageAltoConverter
from ocrd_utils import initLogging
from ocrd.decorators import ocrd_loglevel

@click.command()
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])

@click.command(context_settings=CONTEXT_SETTINGS)
@ocrd_loglevel
@click.option('--check-words/--no-check-words', default=True, help='Check whether PAGE-XML contains any Words and fail if not')
@click.option('--check-border/--no-check-border', default=True, help='Check whether PAGE-XML contains Border or PrintSpace')
@click.option('--skip-empty-lines/--no-skip-empty-lines', default=False, help='Whether to omit or keep empty lines in PAGE-XML')
Expand All @@ -11,8 +15,10 @@
@click.option('--dummy-word/--no-dummy-word', default=True, help='Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word')
@click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index')
@click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element")
@click.argument('filename')
def main(check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, filename):
@click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output, the default)',
type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True))
@click.argument('filename', type=click.Path(dir_okay=False, exists=True))
def main(log_level, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, output_file, filename):
"""
Convert PAGE to ALTO
"""
Expand All @@ -29,7 +35,8 @@ def main(check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dumm
textequiv_fallback_strategy=textequiv_fallback_strategy
)
converter.convert()
print(converter)
with open(1 if output_file == '-' else output_file, 'w') as output:
output.write(str(converter))

if __name__ == '__main__':
main() # pylint: disable=no-value-for-parameter
2 changes: 1 addition & 1 deletion ocrd_page_to_alto/convert.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -283,7 +283,7 @@ def _convert_table(self, parent_alto, parent_page, level=0):
self._convert_textlines(textblock_alto, parent_page)

def convert_text(self):
for reg_page in self.page_page.get_AllRegions(depth=1):
for reg_page in self.page_page.get_AllRegions(depth=0):
reg_page_type = reg_page.__class__.__name__[0:-10] # len('RegionType') == 10
reg_alto_type = REGION_PAGE_TO_ALTO[reg_page_type]
if not reg_alto_type:
Expand Down
3 changes: 2 additions & 1 deletion ocrd_page_to_alto/styles.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -122,6 +122,7 @@ def __init__(self):
def set_alto_tag_from_type(self, reg_alto, reg_page):
typ = reg_page.get_type() if hasattr(reg_page, 'get_type') else None
if typ:
reg_alto.set('TYPE', typ)
if hasattr(reg_alto, 'TYPE'):
reg_alto.set('TYPE', typ)
reg_alto.set('TAGREFS', self.get_id(label=typ))

0 comments on commit 92b1657

Please sign in to comment.