Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various improvements #13

Merged
merged 5 commits on Jun 15, 2021 (base and head branch names were not captured)
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions ocrd_page_to_alto/cli.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import click
from .convert import OcrdPageAltoConverter
from ocrd_utils import initLogging
from ocrd.decorators import ocrd_loglevel

@click.command()
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])

@click.command(context_settings=CONTEXT_SETTINGS)
@ocrd_loglevel
@click.option('--check-words/--no-check-words', default=True, help='Check whether PAGE-XML contains any Words and fail if not')
@click.option('--check-border/--no-check-border', default=True, help='Check whether PAGE-XML contains Border or PrintSpace')
@click.option('--skip-empty-lines/--no-skip-empty-lines', default=False, help='Whether to omit or keep empty lines in PAGE-XML')
Expand All @@ -11,8 +15,10 @@
@click.option('--dummy-word/--no-dummy-word', default=True, help='Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word')
@click.option('--textequiv-index', default=0, help='If multiple textequiv, use the n-th TextEquiv by @index')
@click.option('--textequiv-fallback-strategy', default='last', type=click.Choice(['raise', 'first', 'last']), help="What to do if nth textequiv isn't available. 'raise' will lead to a runtime error, 'first' will use the first TextEquiv, 'last' will use the last TextEquiv on the element")
@click.argument('filename')
def main(check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, filename):
@click.option('-O', '--output-file', default='-', help='Output filename (or "-" for standard output, the default)',
type=click.Path(dir_okay=False, writable=True, exists=False, allow_dash=True))
@click.argument('filename', type=click.Path(dir_okay=False, exists=True))
def main(log_level, check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dummy_textline, dummy_word, textequiv_index, textequiv_fallback_strategy, output_file, filename):
"""
Convert PAGE to ALTO
"""
Expand All @@ -29,7 +35,8 @@ def main(check_words, check_border, skip_empty_lines, trailing_dash_to_hyp, dumm
textequiv_fallback_strategy=textequiv_fallback_strategy
)
converter.convert()
print(converter)
with open(1 if output_file == '-' else output_file, 'w') as output:
kba marked this conversation as resolved.
Show resolved Hide resolved
output.write(str(converter))

if __name__ == '__main__':
main() # pylint: disable=no-value-for-parameter
2 changes: 1 addition & 1 deletion ocrd_page_to_alto/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ def _convert_table(self, parent_alto, parent_page, level=0):
self._convert_textlines(textblock_alto, parent_page)

def convert_text(self):
for reg_page in self.page_page.get_AllRegions(depth=1):
for reg_page in self.page_page.get_AllRegions(depth=0):
reg_page_type = reg_page.__class__.__name__[0:-10] # len('RegionType') == 10
reg_alto_type = REGION_PAGE_TO_ALTO[reg_page_type]
if not reg_alto_type:
Expand Down
3 changes: 2 additions & 1 deletion ocrd_page_to_alto/styles.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def __init__(self):
def set_alto_tag_from_type(self, reg_alto, reg_page):
typ = reg_page.get_type() if hasattr(reg_page, 'get_type') else None
if typ:
reg_alto.set('TYPE', typ)
if hasattr(reg_alto, 'TYPE'):
reg_alto.set('TYPE', typ)
reg_alto.set('TAGREFS', self.get_id(label=typ))