Skip to content

Commit

Permalink
Merge branch 'release-2.36.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Jul 18, 2022
2 parents bd7aecb + 6fa65d0 commit 7523cc5
Show file tree
Hide file tree
Showing 26 changed files with 334 additions and 105 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Fixed:

* `ocrd_utils.generate_range`: `maxsplits` should be 1, not 2, #880
* Typos in CHANGELOG, README and code comments, #890

Changed:

* Consistently use snake_case but continue to support CamelCase for kwargs and CLI options, #874, #862
* Update spec to 3.19.0, introducing greater flexibility in describing parameters, #872, #848, OCR-D/spec#206
* `ocrd workspace merge`: support mapping `file_id` and `page_id` in addition to `file_grp`, #886, #888
* `ocrd workspace merge`: rebase `OcrdFile.url` to target workspace, #887, #888
* Replace `resource_filename` et al from pkg_resources with faster alternatives, #881, #882

## [2.35.0] - 2022-06-02

Changed:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ deps-test:

# (Re)install the tool
install:
$(PIP) install -U pip wheel
$(PIP) install -U pip wheel setuptools fastentrypoints
for mod in $(BUILD_ORDER);do (cd $$mod ; $(PIP_INSTALL) .);done

# Install with pip install -e
Expand Down
60 changes: 37 additions & 23 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ def workspace_cli(ctx, directory, mets, mets_basename, backup):
def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency):
"""
Validate a workspace
METS_URL can be a URL, an absolute path or a path relative to $PWD.
If not given, use --mets accordingly.
Check that the METS and its referenced file contents
abide by the OCR-D specifications.
"""
Expand Down Expand Up @@ -183,8 +183,8 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
except KeyError:
log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (Path(fname).suffix, fname))

kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype, 'pageId': page_id, 'force': force, 'ignore': ignore}
log.debug("Adding '%s' (%s)", fname, kwargs)
log.debug("Adding '%s'", fname)
local_filename = None
if not (fname.startswith('http://') or fname.startswith('https://')):
if not fname.startswith(ctx.directory):
if not isabs(fname) and exists(join(ctx.directory, fname)):
Expand All @@ -202,12 +202,11 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
sys.exit(1)
if fname.startswith(ctx.directory):
fname = relpath(fname, ctx.directory)
kwargs['local_filename'] = fname
local_filename = fname

kwargs['url'] = fname
if not page_id:
log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
workspace.mets.add_file(**kwargs)
workspace.add_file(file_grp, file_id=file_id, mimetype=mimetype, page_id=page_id, force=force, ignore=ignore, local_filename=local_filename, url=fname)
workspace.save_mets()

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -306,7 +305,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
file_id_ = file_id or safe_filename(str(file_path))

# set up file info
file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id_, 'pageId': page_id, 'fileGrp': file_grp}
file_dict = {'url': url, 'mimetype': mimetype, 'file_id': file_id_, 'page_id': page_id, 'file_grp': file_grp}

# guess mime type
if not file_dict['mimetype']:
Expand Down Expand Up @@ -350,7 +349,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
destpath.write_bytes(srcpath.read_bytes())

# Add to workspace (or not)
fileGrp = file_dict.pop('fileGrp')
fileGrp = file_dict.pop('file_grp')
if dry_run:
log.info('workspace.add_file(%s)' % file_dict)
else:
Expand All @@ -372,8 +371,11 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
type=click.Choice([
'url',
'mimetype',
'page_id',
'pageId',
'file_id',
'ID',
'file_grp',
'fileGrp',
'basename',
'basename_without_extension',
Expand All @@ -389,14 +391,16 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down
(If any ``FILTER`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
output_field = [snake_to_camel.get(x, x) for x in output_field]
modified_mets = False
ret = list()
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
for f in workspace.mets.find_files(
ID=file_id,
fileGrp=file_grp,
for f in workspace.find_files(
file_id=file_id,
file_grp=file_grp,
mimetype=mimetype,
pageId=page_id,
page_id=page_id,
):
if download and not f.local_filename:
workspace.download_file(f)
Expand Down Expand Up @@ -428,7 +432,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down
def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefined-builtin
"""
Delete files (given by their ID attribute ``ID``).
(If any ``ID`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
Expand Down Expand Up @@ -467,7 +471,7 @@ def rename_group(ctx, old, new):
def remove_group(ctx, group, recursive, force, keep_files):
"""
Delete fileGrps (given by their USE attribute ``GROUP``).
(If any ``GROUP`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
Expand Down Expand Up @@ -495,11 +499,11 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
"""
workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
with pushd_popd(workspace.directory):
for f in workspace.mets.find_files(
ID=file_id,
fileGrp=file_grp,
for f in workspace.find_files(
file_id=file_id,
file_grp=file_grp,
mimetype=mimetype,
pageId=page_id,
page_id=page_id,
):
try:
if not f.local_filename or not exists(f.local_filename):
Expand Down Expand Up @@ -573,16 +577,24 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
# ocrd workspace merge
# ----------------------------------------------------------------------

def _handle_json_option(ctx, param, value):
return parse_json_string_or_file(value) if value else None

@workspace_cli.command('merge')
@click.argument('METS_PATH')
@click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True)
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp")
@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option)
@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option)
@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option)
@mets_find_options
@pass_workspace
def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype, mets_path): # pylint: disable=redefined-builtin
"""
Merges this workspace with the workspace that contains ``METS_PATH``
Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping``
in order to rename all fileGrp, file ID or page ID values, respectively.
The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
for an explanation.
Expand All @@ -596,9 +608,11 @@ def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype
other_workspace,
copy_files=copy_files,
fileGrp_mapping=filegrp_mapping,
fileGrp=file_grp,
ID=file_id,
pageId=page_id,
fileId_mapping=fileid_mapping,
pageId_mapping=pageid_mapping,
file_grp=file_grp,
file_id=file_id,
page_id=page_id,
mimetype=mimetype,
)
workspace.save_mets()
Expand Down
2 changes: 1 addition & 1 deletion ocrd/ocrd/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Constants for ocrd.
"""
from pkg_resources import resource_filename
from ocrd_utils.package_resources import resource_filename

__all__ = [
'TMP_PREFIX',
Expand Down
16 changes: 8 additions & 8 deletions ocrd/ocrd/processor/builtin/dummy_processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# pylint: disable=missing-module-docstring,invalid-name
from os.path import join, basename
from pkg_resources import resource_string
from ocrd_utils.package_resources import resource_string

import click

Expand All @@ -17,7 +17,7 @@
)
from ocrd_modelfactory import page_from_file

OCRD_TOOL = parse_json_string_with_comments(resource_string(__name__, 'dummy/ocrd-tool.json').decode('utf8'))
OCRD_TOOL = parse_json_string_with_comments(resource_string(__name__, 'ocrd-tool.json').decode('utf8'))

class DummyProcessor(Processor):
"""
Expand All @@ -40,9 +40,9 @@ def process(self):
if input_file.mimetype == MIMETYPE_PAGE:
# Source file is PAGE-XML: Write out in-memory PcGtsType
self.workspace.add_file(
ID=file_id,
file_id=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
page_id=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=local_filename,
content=to_xml(pcgts).encode('utf-8'))
Expand All @@ -51,9 +51,9 @@ def process(self):
with open(input_file.local_filename, 'rb') as f:
content = f.read()
self.workspace.add_file(
ID=file_id,
file_id=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
page_id=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=local_filename,
content=content)
Expand All @@ -66,9 +66,9 @@ def process(self):
LOG.info("Add PAGE-XML %s generated for %s at %s",
page_file_id, file_id, page_filename)
self.workspace.add_file(
ID=page_file_id,
file_id=page_file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
page_id=input_file.pageId,
mimetype=MIMETYPE_PAGE,
local_filename=page_filename,
content=to_xml(pcgts).encode('utf-8'))
Expand Down
File renamed without changes.
Loading

0 comments on commit 7523cc5

Please sign in to comment.