From c086f914e92f2a8cfa6a26aea8b089b1dbd44af7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 23 May 2022 11:27:09 +0200 Subject: [PATCH 01/36] Update OCR-D/spec to 3.19.0 --- .../ocrd_validators/bagit-profile.yml | 8 +- .../ocrd_validators/ocrd_tool.schema.yml | 87 ++++++++++++++++++- repo/spec | 2 +- 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/ocrd_validators/ocrd_validators/bagit-profile.yml b/ocrd_validators/ocrd_validators/bagit-profile.yml index 3653b8e994..c0a82d4854 100644 --- a/ocrd_validators/ocrd_validators/bagit-profile.yml +++ b/ocrd_validators/ocrd_validators/bagit-profile.yml @@ -1,5 +1,5 @@ BagIt-Profile-Info: - BagIt-Profile-Identifier: https://ocr-d.de/bagit-profile.json + BagIt-Profile-Identifier: https://ocr-d.de/en/spec/bagit-profile.json BagIt-Profile-Version: '1.2.0' Source-Organization: OCR-D External-Description: BagIt profile for OCR data @@ -14,10 +14,6 @@ Bag-Info: Ocrd-Mets: required: false default: 'mets.xml' - Ocrd-Manifestation-Depth: - required: false - default: partial - values: ["partial", "full"] Ocrd-Identifier: required: true Ocrd-Checksum: @@ -34,7 +30,7 @@ Tag-Files-Allowed: - sources.csv - metadata/*.xml - metadata/*.txt -Allow-Fetch.txt: true +Allow-Fetch.txt: false Serialization: required Accept-Serialization: application/zip Accept-BagIt-Version: diff --git a/ocrd_validators/ocrd_validators/ocrd_tool.schema.yml b/ocrd_validators/ocrd_validators/ocrd_tool.schema.yml index d91fd52611..766fd892cc 100644 --- a/ocrd_validators/ocrd_validators/ocrd_tool.schema.yml +++ b/ocrd_validators/ocrd_validators/ocrd_tool.schema.yml @@ -41,13 +41,13 @@ properties: type: array items: type: string - pattern: '^OCR-D-[A-Z0-9-]+$' + # pattern: '^OCR-D-[A-Z0-9-]+$' output_file_grp: description: Output fileGrp@USE this tool produces by default type: array items: type: string - pattern: '^OCR-D-[A-Z0-9-]+$' + # pattern: '^OCR-D-[A-Z0-9-]+$' parameters: description: Object describing the parameters of a tool. 
Keys are parameter names, values sub-schemas. type: object @@ -73,6 +73,30 @@ properties: description: Subtype, such as `float` for type `number` or `uri` for type `string`. description: description: Concise description of syntax and semantics of this parameter + items: + type: object + description: describe the items of an array further + minimum: + type: number + description: Minimum value for number parameters, including the minimum + maximum: + type: number + description: Maximum value for number parameters, including the maximum + exclusiveMinimum: + type: number + description: Minimum value for number parameters, excluding the minimum + exclusiveMaximum: + type: number + description: Maximum value for number parameters, excluding the maximum + multipleOf: + type: number + description: For number values, those values must be multiple of this number + properties: + type: object + description: Describe the properties of an object value + additionalProperties: + type: boolean + description: Whether an object value may contain properties not explicitly defined required: type: boolean description: Whether this parameter is required @@ -83,7 +107,15 @@ properties: description: List the allowed values if a fixed list. content-type: type: string - description: "If parameter is reference to file: Media type of the file" + default: 'application/octet-stream' + description: > + The media type of resources this processor expects for + this parameter. Most processors use files for resources + (e.g. `*.traineddata` for `ocrd-tesserocr-recognize`) + while others use directories of files (e.g. `default` for + `ocrd-eynollah-segment`). If a parameter requires + directories, it must set `content-type` to + `text/directory`. cacheable: type: boolean description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change." 
@@ -126,3 +158,52 @@ properties: - layout/segmentation/word - layout/segmentation/classification - layout/analysis + resource_locations: + type: array + description: The locations in the filesystem this processor supports for resource lookup + default: ['data', 'cwd', 'system', 'module'] + items: + type: string + enum: ['data', 'cwd', 'system', 'module'] + resources: + type: array + description: Resources for this processor + items: + type: object + additionalProperties: false + required: + - url + - description + - name + - size + properties: + url: + type: string + description: URLs of all components of this resource + description: + type: string + description: A description of the resource + name: + type: string + description: Name to store the resource as + type: + type: string + enum: ['file', 'directory', 'archive'] + default: file + description: Type of the URL + parameter_usage: + type: string + description: Defines how the parameter is to be used + enum: ['as-is', 'without-extension'] + default: 'as-is' + path_in_archive: + type: string + description: if type is archive, the resource is at this location in the archive + default: '.' 
+ version_range: + type: string + description: Range of supported versions, syntax like in PEP 440 + default: '>= 0.0.1' + size: + type: number + description: Size of the resource in bytes diff --git a/repo/spec b/repo/spec index 39b20c4ece..5ce548f9a2 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 39b20c4eced8417252ea7335e6968c47b325ca59 +Subproject commit 5ce548f9a2d9f764bf12a6d299081ccf00a5f09f From d7b2077b800df3333eebc2885c954177ab002108 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 23 May 2022 12:08:15 +0200 Subject: [PATCH 02/36] json-schema: upgrade to draft6 to support OCR-D/spec#206 --- .../ocrd_validators/json_validator.py | 8 +++---- .../ocrd_validators/parameter_validator.py | 4 ++-- .../resource_list_validator.py | 4 ++-- tests/validator/test_json_validator.py | 4 ++-- tests/validator/test_parameter_validator.py | 24 ++++++++++++++++++- 5 files changed, 33 insertions(+), 11 deletions(-) diff --git a/ocrd_validators/ocrd_validators/json_validator.py b/ocrd_validators/ocrd_validators/json_validator.py index 57a0a9a37c..c920fc7c2d 100644 --- a/ocrd_validators/ocrd_validators/json_validator.py +++ b/ocrd_validators/ocrd_validators/json_validator.py @@ -3,7 +3,7 @@ """ import json -from jsonschema import Draft4Validator, validators # pylint: disable=import-error +from jsonschema import Draft6Validator, validators # pylint: disable=import-error from ocrd_models import ValidationReport @@ -28,7 +28,7 @@ def set_defaults(validator, properties, instance, schema): return validators.extend(validator_class, {"properties": set_defaults}) -DefaultValidatingDraft4Validator = extend_with_default(Draft4Validator) +DefaultValidatingDraft6Validator = extend_with_default(Draft6Validator) # # ------------------------------------------------- @@ -52,13 +52,13 @@ def validate(obj, schema): obj = json.loads(obj) return JsonValidator(schema)._validate(obj) # pylint: disable=protected-access - def __init__(self, schema, 
validator_class=Draft4Validator): + def __init__(self, schema, validator_class=Draft6Validator): """ Construct a JsonValidator. Args: schema (dict): - validator_class (Draft4Validator|DefaultValidatingDraft4Validator): + validator_class (Draft6Validator|DefaultValidatingDraft6Validator): """ self.validator = validator_class(schema) diff --git a/ocrd_validators/ocrd_validators/parameter_validator.py b/ocrd_validators/ocrd_validators/parameter_validator.py index 91cb01fbb4..20dd6ff2b7 100644 --- a/ocrd_validators/ocrd_validators/parameter_validator.py +++ b/ocrd_validators/ocrd_validators/parameter_validator.py @@ -1,7 +1,7 @@ """ Validate parameters against ocrd-tool.json. """ -from .json_validator import JsonValidator, DefaultValidatingDraft4Validator +from .json_validator import JsonValidator, DefaultValidatingDraft6Validator # # ------------------------------------------------- @@ -45,4 +45,4 @@ def __init__(self, ocrd_tool): "required": required, "additionalProperties": False, "properties": p - }, DefaultValidatingDraft4Validator) + }, DefaultValidatingDraft6Validator) diff --git a/ocrd_validators/ocrd_validators/resource_list_validator.py b/ocrd_validators/ocrd_validators/resource_list_validator.py index ab1b53a2f6..72a11c34de 100644 --- a/ocrd_validators/ocrd_validators/resource_list_validator.py +++ b/ocrd_validators/ocrd_validators/resource_list_validator.py @@ -4,7 +4,7 @@ See `specs `_. """ from .constants import RESOURCE_LIST_SCHEMA -from .json_validator import JsonValidator, DefaultValidatingDraft4Validator +from .json_validator import JsonValidator, DefaultValidatingDraft6Validator # # ------------------------------------------------- @@ -20,5 +20,5 @@ def validate(obj, schema=RESOURCE_LIST_SCHEMA): """ Validate against ``resource_list.schema.yml`` schema. 
""" - return JsonValidator(schema, validator_class=DefaultValidatingDraft4Validator)._validate(obj) + return JsonValidator(schema, validator_class=DefaultValidatingDraft6Validator)._validate(obj) diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py index 546195326e..8a8387d4b6 100644 --- a/tests/validator/test_json_validator.py +++ b/tests/validator/test_json_validator.py @@ -1,5 +1,5 @@ from tests.base import TestCase, main -from ocrd_validators.json_validator import JsonValidator, DefaultValidatingDraft4Validator +from ocrd_validators.json_validator import JsonValidator, DefaultValidatingDraft6Validator class TestParameterValidator(TestCase): @@ -15,7 +15,7 @@ def setUp(self): } } } - self.defaults_validator = JsonValidator(self.schema, DefaultValidatingDraft4Validator) + self.defaults_validator = JsonValidator(self.schema, DefaultValidatingDraft6Validator) super().setUp() def test_validate_string(self): diff --git a/tests/validator/test_parameter_validator.py b/tests/validator/test_parameter_validator.py index f18937779a..f0d9d41d2c 100644 --- a/tests/validator/test_parameter_validator.py +++ b/tests/validator/test_parameter_validator.py @@ -45,6 +45,28 @@ def test_default_assignment(self): self.assertTrue(report.is_valid) self.assertEqual(obj, {'baz': '23', "num-param": 1}) +def test_min_max(): + validator = ParameterValidator({ + "parameters": { + "num-param": { + "type": "number", + "exclusiveMinimum": 10, + "maximum": 100, + "multipleOf": 2 + } + } + }) + report = validator.validate({'num-param': 23}) + assert not report.is_valid + assert 'is not a multiple of 2' in report.errors[0] + report = validator.validate({'num-param': 102}) + assert not report.is_valid + assert 'is greater than the maximum of' in report.errors[0] + report = validator.validate({'num-param': 8}) + assert not report.is_valid + assert 'is less than or equal to the minimum of' in report.errors[0] + + if __name__ == '__main__': - main() + 
main(__name__) From b252d5612771daa8eafb40323c3f6980db9c9c28 Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 24 May 2022 15:59:31 +0200 Subject: [PATCH 03/36] replace pageId with page_id for code in ocrd (e.g. workspace), ocrd_models keep pageId. --- ocrd/ocrd/cli/workspace.py | 14 +++++++------- ocrd/ocrd/processor/builtin/dummy_processor.py | 6 +++--- ocrd/ocrd/workspace.py | 12 +++++++++--- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 84e2b5f733..262076c370 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -67,10 +67,10 @@ def workspace_cli(ctx, directory, mets, mets_basename, backup): def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency, page_coordinate_consistency): """ Validate a workspace - + METS_URL can be a URL, an absolute path or a path relative to $PWD. If not given, use --mets accordingly. - + Check that the METS and its referenced file contents abide by the OCR-D specifications. """ @@ -183,7 +183,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ except KeyError: log.error("Cannot guess mimetype from extension '%s' for '%s'. 
Set --mimetype explicitly" % (Path(fname).suffix, fname)) - kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype, 'pageId': page_id, 'force': force, 'ignore': ignore} + kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype, 'page_id': page_id, 'force': force, 'ignore': ignore} log.debug("Adding '%s' (%s)", fname, kwargs) if not (fname.startswith('http://') or fname.startswith('https://')): if not fname.startswith(ctx.directory): @@ -306,7 +306,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp file_id_ = file_id or safe_filename(str(file_path)) # set up file info - file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id_, 'pageId': page_id, 'fileGrp': file_grp} + file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id_, 'page_id': page_id, 'fileGrp': file_grp} # guess mime type if not file_dict['mimetype']: @@ -428,7 +428,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefined-builtin """ Delete files (given by their ID attribute ``ID``). - + (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ @@ -467,7 +467,7 @@ def rename_group(ctx, old, new): def remove_group(ctx, group, recursive, force, keep_files): """ Delete fileGrps (given by their USE attribute ``GROUP``). - + (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) 
""" @@ -598,7 +598,7 @@ def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype fileGrp_mapping=filegrp_mapping, fileGrp=file_grp, ID=file_id, - pageId=page_id, + page_id=page_id, mimetype=mimetype, ) workspace.save_mets() diff --git a/ocrd/ocrd/processor/builtin/dummy_processor.py b/ocrd/ocrd/processor/builtin/dummy_processor.py index 9a1ad511e7..539caa3f62 100644 --- a/ocrd/ocrd/processor/builtin/dummy_processor.py +++ b/ocrd/ocrd/processor/builtin/dummy_processor.py @@ -42,7 +42,7 @@ def process(self): self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, content=to_xml(pcgts).encode('utf-8')) @@ -53,7 +53,7 @@ def process(self): self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=local_filename, content=content) @@ -68,7 +68,7 @@ def process(self): self.workspace.add_file( ID=page_file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=page_filename, content=to_xml(pcgts).encode('utf-8')) diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index c4799592e6..bd254f97f1 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -33,6 +33,7 @@ polygon_from_points, xywh_from_bbox, pushd_popd, + deprecated_alias, MIME_TO_EXT, MIME_TO_PIL, MIMETYPE_PAGE, @@ -93,6 +94,7 @@ def reload_mets(self): """ self.mets = OcrdMets(filename=self.mets_target) + @deprecated_alias(pageId="page_id") def merge(self, other_workspace, copy_files=True, **kwargs): """ Merge ``other_workspace`` into this one @@ -114,6 +116,8 @@ def after_add_cb(f): makedirs(str(fpath_dest.parent)) with open(str(fpath_src), 'rb') as fstream_in, open(str(fpath_dest), 'wb') as fstream_out: copyfileobj(fstream_in, fstream_out) + if 'page_id' in 
kwargs: + kwargs['pageId'] = kwargs.pop('page_id') self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs) @@ -326,6 +330,7 @@ def rename_file_group(self, old, new): if Path(old).is_dir() and not listdir(old): Path(old).rmdir() + @deprecated_alias(pageId="page_id") def add_file(self, file_grp, content=None, **kwargs): """ Add a file to the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace. @@ -345,8 +350,8 @@ def add_file(self, file_grp, content=None, **kwargs): file_grp, kwargs.get('local_filename'), content is not None) - if 'pageId' not in kwargs: - raise ValueError("workspace.add_file must be passed a 'pageId' kwarg, even if it is None.") + if 'page_id' not in kwargs: + raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.") if content is not None and 'local_filename' not in kwargs: raise Exception("'content' was set but no 'local_filename'") if self.overwrite_mode: @@ -362,6 +367,7 @@ def add_file(self, file_grp, content=None, **kwargs): kwargs['url'] = kwargs['local_filename'] # print(kwargs) + kwargs["pageId"] = kwargs.pop("page_id") ret = self.mets.add_file(file_grp, **kwargs) if content is not None: @@ -997,7 +1003,7 @@ def save_image_file(self, image, out = self.add_file( file_grp, ID=file_id, - pageId=page_id, + page_id=page_id, local_filename=file_path, mimetype=mimetype, content=image_bytes.getvalue(), From ef8b8518c686da8c0a346784c863321005e39a00 Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 24 May 2022 16:00:09 +0200 Subject: [PATCH 04/36] change tests according to page_id change --- tests/cli/test_workspace.py | 4 ++-- tests/processor/test_processor.py | 28 ++++++++++++++-------------- tests/test_decorators.py | 8 ++++---- tests/test_task_sequence.py | 2 +- tests/test_workspace.py | 22 +++++++++++----------- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index 1a7462040d..b728873fc9 100644 --- 
a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -53,7 +53,7 @@ def test_add(self): file_grp, ID=ID, content=content, - pageId=page_id, + page_id=page_id, mimetype=mimetype, local_filename=local_filename ) @@ -452,7 +452,7 @@ def test_bulk_add0(self): def test_bulk_add_missing_param(self): with pushd_popd(tempdir=True) as wsdir: ws = self.resolver.workspace_from_nothing(directory=wsdir) - with pytest.raises(ValueError, match=r"OcrdFile attribute 'pageId' unset"): + with pytest.raises(ValueError, match=r"OcrdFile attribute 'page_id' unset"): _, out, err = self.invoke_cli(workspace_cli, [ 'bulk-add', '-r', r'(?P.*) (?P.*) (?P.*) (?P.*) (?P.*) (?P.*)', diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 726f48681f..b83564b1fb 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -105,10 +105,10 @@ def test_zip_input_files(self): class ZipTestProcessor(Processor): pass with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', page_id='phys_0001') + ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', page_id='phys_0002') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) @@ -125,12 +125,12 @@ def test_zip_input_files_multi_mixed(self): class ZipTestProcessor(Processor): 
pass with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', pageId='phys_0001') - ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', pageId='phys_0001') - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', pageId='phys_0002') - ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', page_id='phys_0001') + ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', page_id='phys_0001') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', page_id='phys_0002') + ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) @@ -141,7 +141,7 @@ class ZipTestProcessor(Processor): pass print("PAGE-filtered") tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] assert ('foobar3', None) in tuples - ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', pageId='phys_0002') + ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) @@ -152,7 +152,7 @@ class ZipTestProcessor(Processor): pass assert ('foobar3', None) in tuples with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): tuples = 
proc.zip_input_files(on_error='abort') - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', pageId='phys_0001') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) @@ -164,8 +164,8 @@ class ZipTestProcessor(Processor): pass self.capture_out_err() with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId=None) - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', page_id=None) + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index c1debf5bce..2b44a13eda 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -119,10 +119,10 @@ def _sample_ws_for_overwrite(self): resolver = Resolver() with TemporaryDirectory() as tempdir: ws = resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('IN-GRP', pageId='pID1', ID='fID1', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID1.tif')) - ws.add_file('OUT-GRP', pageId='pID2', ID='fID2', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID2.tif')) - ws.add_file('OUT-GRP', pageId='pID3', ID='fID3', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID3.tif')) - ws.add_file('OUT-GRP', pageId='pID4', ID='fID4', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID4.tif')) + ws.add_file('IN-GRP', page_id='pID1', ID='fID1', mimetype='image/tiff', content='CONTENT', 
local_filename=join(tempdir, 'ID1.tif')) + ws.add_file('OUT-GRP', page_id='pID2', ID='fID2', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID2.tif')) + ws.add_file('OUT-GRP', page_id='pID3', ID='fID3', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID3.tif')) + ws.add_file('OUT-GRP', page_id='pID4', ID='fID4', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID4.tif')) ws.save_mets() yield ws diff --git a/tests/test_task_sequence.py b/tests/test_task_sequence.py index e33da7c5f7..826b0c9665 100644 --- a/tests/test_task_sequence.py +++ b/tests/test_task_sequence.py @@ -138,7 +138,7 @@ def test_task_run(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): ws = resolver.workspace_from_url('mets.xml') - ws.add_file('GRP0', content='', local_filename='GRP0/foo', ID='file0', mimetype=MIMETYPE_PAGE, pageId=None) + ws.add_file('GRP0', content='', local_filename='GRP0/foo', ID='file0', mimetype=MIMETYPE_PAGE, page_id=None) ws.save_mets() files_before = len(ws.mets.find_all_files()) run_tasks('mets.xml', 'DEBUG', None, [ diff --git a/tests/test_workspace.py b/tests/test_workspace.py index de2c0ef833..45c838672e 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -66,7 +66,7 @@ def test_workspace_add_file(plain_workspace): ID='ID1', mimetype='image/tiff', content='CONTENT', - pageId=None, + page_id=None, local_filename=fpath ) f = plain_workspace.mets.find_all_files()[0] @@ -80,7 +80,7 @@ def test_workspace_add_file(plain_workspace): def test_workspace_add_file_basename_no_content(plain_workspace): - plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', pageId=None) + plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', page_id=None) f = next(plain_workspace.mets.find_files()) # assert @@ -89,7 +89,7 @@ def test_workspace_add_file_basename_no_content(plain_workspace): def 
test_workspace_add_file_binary_content(plain_workspace): fpath = join(plain_workspace.directory, 'subdir', 'ID1.tif') - plain_workspace.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar', pageId=None) + plain_workspace.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar', page_id=None) # assert assert exists(fpath) @@ -98,7 +98,7 @@ def test_workspace_add_file_binary_content(plain_workspace): def test_workspacec_add_file_content_wo_local_filename(plain_workspace): # act with pytest.raises(Exception) as fn_exc: - plain_workspace.add_file('GRP', ID='ID1', content=b'CONTENT', pageId='foo1234') + plain_workspace.add_file('GRP', ID='ID1', content=b'CONTENT', page_id='foo1234') assert "'content' was set but no 'local_filename'" in str(fn_exc.value) @@ -108,7 +108,7 @@ def test_workspacec_add_file_content_wo_pageid(plain_workspace): with pytest.raises(ValueError) as val_err: plain_workspace.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename='foo') - assert "workspace.add_file must be passed a 'pageId' kwarg, even if it is None." in str(val_err.value) + assert "workspace.add_file must be passed a 'page_id' kwarg, even if it is None." 
in str(val_err.value) def test_workspace_str(plain_workspace): @@ -260,7 +260,7 @@ def test_remove_file_force(sbb_data_workspace): def test_remove_file_remote_not_available_raises_exception(plain_workspace): - plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', url='http://remote', pageId=None) + plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', url='http://remote', page_id=None) with pytest.raises(Exception) as not_avail_exc: plain_workspace.remove_file('page1_img') @@ -270,7 +270,7 @@ def test_remove_file_remote_not_available_raises_exception(plain_workspace): def test_remove_file_remote(plain_workspace): # act - plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', url='http://remote', pageId=None) + plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', url='http://remote', page_id=None) # must succeed because removal is enforced assert plain_workspace.remove_file('page1_img', force=True) @@ -342,7 +342,7 @@ def test_remove_file_group_flat(plain_workspace): """ # act - added_res = plain_workspace.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='file.ext', content='foo', pageId=None).url + added_res = plain_workspace.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='file.ext', content='foo', page_id=None).url # requires additional prepending of current path because not pushd_popd-magic at work added_path = Path(join(plain_workspace.directory, added_res)) @@ -382,8 +382,8 @@ def test_download_to_directory_from_workspace_download_file(plain_workspace): """ https://github.com/OCR-D/core/issues/342 """ - f1 = plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', local_filename='test.tif', content='', pageId=None) - f2 = plain_workspace.add_file('GT', ID='page1_gt', mimetype='text/xml', local_filename='test.xml', content='', pageId=None) + f1 = plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', local_filename='test.tif', content='', 
page_id=None) + f2 = plain_workspace.add_file('GT', ID='page1_gt', mimetype='text/xml', local_filename='test.xml', content='', page_id=None) assert f1.url == 'test.tif' assert f2.url == 'test.xml' @@ -577,7 +577,7 @@ def test_downsample_16bit_image(plain_workspace): tif_out.write(gzip_in.read()) # act - plain_workspace.add_file('IMG', ID='foo', url=img_path, mimetype='image/tiff', pageId=None) + plain_workspace.add_file('IMG', ID='foo', url=img_path, mimetype='image/tiff', page_id=None) # assert pil_before = Image.open(img_path) From 9f130a8c76035d0ffbf86be86ccdad8fc90f7a0d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 1 Jun 2022 12:59:18 +0200 Subject: [PATCH 05/36] update spec --- repo/spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo/spec b/repo/spec index 39b20c4ece..5ce548f9a2 160000 --- a/repo/spec +++ b/repo/spec @@ -1 +1 @@ -Subproject commit 39b20c4eced8417252ea7335e6968c47b325ca59 +Subproject commit 5ce548f9a2d9f764bf12a6d299081ccf00a5f09f From af646b6b00c34998df8d14ba337ca5106509ef87 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 1 Jun 2022 12:59:47 +0200 Subject: [PATCH 06/36] update bagit-profile and ocrd_tool.schema.yml from spec --- .../ocrd_validators/bagit-profile.yml | 8 +- .../ocrd_validators/ocrd_tool.schema.yml | 87 ++++++++++++++++++- 2 files changed, 86 insertions(+), 9 deletions(-) diff --git a/ocrd_validators/ocrd_validators/bagit-profile.yml b/ocrd_validators/ocrd_validators/bagit-profile.yml index 3653b8e994..c0a82d4854 100644 --- a/ocrd_validators/ocrd_validators/bagit-profile.yml +++ b/ocrd_validators/ocrd_validators/bagit-profile.yml @@ -1,5 +1,5 @@ BagIt-Profile-Info: - BagIt-Profile-Identifier: https://ocr-d.de/bagit-profile.json + BagIt-Profile-Identifier: https://ocr-d.de/en/spec/bagit-profile.json BagIt-Profile-Version: '1.2.0' Source-Organization: OCR-D External-Description: BagIt profile for OCR data @@ -14,10 +14,6 @@ Bag-Info: Ocrd-Mets: required: false default: 'mets.xml' 
- Ocrd-Manifestation-Depth: - required: false - default: partial - values: ["partial", "full"] Ocrd-Identifier: required: true Ocrd-Checksum: @@ -34,7 +30,7 @@ Tag-Files-Allowed: - sources.csv - metadata/*.xml - metadata/*.txt -Allow-Fetch.txt: true +Allow-Fetch.txt: false Serialization: required Accept-Serialization: application/zip Accept-BagIt-Version: diff --git a/ocrd_validators/ocrd_validators/ocrd_tool.schema.yml b/ocrd_validators/ocrd_validators/ocrd_tool.schema.yml index d91fd52611..766fd892cc 100644 --- a/ocrd_validators/ocrd_validators/ocrd_tool.schema.yml +++ b/ocrd_validators/ocrd_validators/ocrd_tool.schema.yml @@ -41,13 +41,13 @@ properties: type: array items: type: string - pattern: '^OCR-D-[A-Z0-9-]+$' + # pattern: '^OCR-D-[A-Z0-9-]+$' output_file_grp: description: Output fileGrp@USE this tool produces by default type: array items: type: string - pattern: '^OCR-D-[A-Z0-9-]+$' + # pattern: '^OCR-D-[A-Z0-9-]+$' parameters: description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas. type: object @@ -73,6 +73,30 @@ properties: description: Subtype, such as `float` for type `number` or `uri` for type `string`. 
description: description: Concise description of syntax and semantics of this parameter + items: + type: object + description: describe the items of an array further + minimum: + type: number + description: Minimum value for number parameters, including the minimum + maximum: + type: number + description: Maximum value for number parameters, including the maximum + exclusiveMinimum: + type: number + description: Minimum value for number parameters, excluding the minimum + exclusiveMaximum: + type: number + description: Maximum value for number parameters, excluding the maximum + multipleOf: + type: number + description: For number values, those values must be multiple of this number + properties: + type: object + description: Describe the properties of an object value + additionalProperties: + type: boolean + description: Whether an object value may contain properties not explicitly defined required: type: boolean description: Whether this parameter is required @@ -83,7 +107,15 @@ properties: description: List the allowed values if a fixed list. content-type: type: string - description: "If parameter is reference to file: Media type of the file" + default: 'application/octet-stream' + description: > + The media type of resources this processor expects for + this parameter. Most processors use files for resources + (e.g. `*.traineddata` for `ocrd-tesserocr-recognize`) + while others use directories of files (e.g. `default` for + `ocrd-eynollah-segment`). If a parameter requires + directories, it must set `content-type` to + `text/directory`. cacheable: type: boolean description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change." 
@@ -126,3 +158,52 @@ properties: - layout/segmentation/word - layout/segmentation/classification - layout/analysis + resource_locations: + type: array + description: The locations in the filesystem this processor supports for resource lookup + default: ['data', 'cwd', 'system', 'module'] + items: + type: string + enum: ['data', 'cwd', 'system', 'module'] + resources: + type: array + description: Resources for this processor + items: + type: object + additionalProperties: false + required: + - url + - description + - name + - size + properties: + url: + type: string + description: URLs of all components of this resource + description: + type: string + description: A description of the resource + name: + type: string + description: Name to store the resource as + type: + type: string + enum: ['file', 'directory', 'archive'] + default: file + description: Type of the URL + parameter_usage: + type: string + description: Defines how the parameter is to be used + enum: ['as-is', 'without-extension'] + default: 'as-is' + path_in_archive: + type: string + description: if type is archive, the resource is at this location in the archive + default: '.' + version_range: + type: string + description: Range of supported versions, syntax like in PEP 440 + default: '>= 0.0.1' + size: + type: number + description: Size of the resource in bytes From 2cba7dd09b6b2e098ec219f8c4eaac395d533a43 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 2 Jun 2022 16:00:45 +0200 Subject: [PATCH 07/36] :memo: changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a8b675bc1..91cc78f6f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ Versioned according to [Semantic Versioning](http://semver.org/). 
## Unreleased +Changed: + + * OCRD-ZIP: Drop `Ocrd-Manifestation-Depth` and disallow `fetch.txt`, OCR-D/spec#182 + * Parameters can now be described with most JSON-Schema constructs, OCR-D/spec#206, #848 + ## [2.34.0] - 2022-05-20 Added: From 43a3ff76e145168adedcb979e82e4f44e942468f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 2 Jun 2022 16:01:36 +0200 Subject: [PATCH 08/36] :package: v2.35.0 --- CHANGELOG.md | 3 +++ ocrd_utils/setup.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 91cc78f6f3..7b33cce54a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.35.0] - 2022-06-02 + Changed: * OCRD-ZIP: Drop `Ocrd-Manifestation-Depth` and disallow `fetch.txt`, OCR-D/spec#182 @@ -1487,6 +1489,7 @@ Fixed Initial Release +[2.35.0]: ../../compare/v2.35.0..v2.34.0 [2.34.0]: ../../compare/v2.34.0..v2.33.0 [2.33.0]: ../../compare/v2.33.0..v2.32.0 [2.32.0]: ../../compare/v2.32.0..v2.31.0 diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py index b21ee051fb..d7ab560e4b 100644 --- a/ocrd_utils/setup.py +++ b/ocrd_utils/setup.py @@ -5,7 +5,7 @@ setup( name='ocrd_utils', - version='2.34.0', + version='2.35.0', description='OCR-D framework - shared code, helpers, constants', long_description=open('README.md').read(), long_description_content_type='text/markdown', From a51b1023fbf0be99645b77cdaa4279682e6cf504 Mon Sep 17 00:00:00 2001 From: joschrew Date: Thu, 9 Jun 2022 09:14:46 +0200 Subject: [PATCH 09/36] undo wrong pageId conversion in workspace_add_file --- ocrd/ocrd/cli/workspace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 262076c370..e3094e1f7f 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -183,7 +183,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ except 
KeyError: log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (Path(fname).suffix, fname)) - kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype, 'page_id': page_id, 'force': force, 'ignore': ignore} + kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype, 'pageId': page_id, 'force': force, 'ignore': ignore} log.debug("Adding '%s' (%s)", fname, kwargs) if not (fname.startswith('http://') or fname.startswith('https://')): if not fname.startswith(ctx.directory): From aad1d9dc01bb5c70cfbc9c5255b512d084714a7f Mon Sep 17 00:00:00 2001 From: joschrew Date: Thu, 9 Jun 2022 15:06:05 +0200 Subject: [PATCH 10/36] continue replacing camelCase with snake_case previously only pageId, now also ID and fileGrp --- ocrd/ocrd/cli/workspace.py | 8 ++--- .../ocrd/processor/builtin/dummy_processor.py | 6 ++-- ocrd/ocrd/workspace.py | 34 +++++++++++++------ 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index e3094e1f7f..b8ba5fa6fe 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -306,7 +306,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp file_id_ = file_id or safe_filename(str(file_path)) # set up file info - file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id_, 'page_id': page_id, 'fileGrp': file_grp} + file_dict = {'url': url, 'mimetype': mimetype, 'file_id': file_id_, 'page_id': page_id, 'file_grp': file_grp} # guess mime type if not file_dict['mimetype']: @@ -350,7 +350,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp destpath.write_bytes(srcpath.read_bytes()) # Add to workspace (or not) - fileGrp = file_dict.pop('fileGrp') + fileGrp = file_dict.pop('file_grp') if dry_run: log.info('workspace.add_file(%s)' % file_dict) else: @@ -596,8 +596,8 @@ def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype 
other_workspace, copy_files=copy_files, fileGrp_mapping=filegrp_mapping, - fileGrp=file_grp, - ID=file_id, + file_grp=file_grp, + file_id=file_id, page_id=page_id, mimetype=mimetype, ) diff --git a/ocrd/ocrd/processor/builtin/dummy_processor.py b/ocrd/ocrd/processor/builtin/dummy_processor.py index 539caa3f62..459a164f30 100644 --- a/ocrd/ocrd/processor/builtin/dummy_processor.py +++ b/ocrd/ocrd/processor/builtin/dummy_processor.py @@ -40,7 +40,7 @@ def process(self): if input_file.mimetype == MIMETYPE_PAGE: # Source file is PAGE-XML: Write out in-memory PcGtsType self.workspace.add_file( - ID=file_id, + file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, @@ -51,7 +51,7 @@ def process(self): with open(input_file.local_filename, 'rb') as f: content = f.read() self.workspace.add_file( - ID=file_id, + file_id=file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=input_file.mimetype, @@ -66,7 +66,7 @@ def process(self): LOG.info("Add PAGE-XML %s generated for %s at %s", page_file_id, file_id, page_filename) self.workspace.add_file( - ID=page_file_id, + file_id=page_file_id, file_grp=self.output_file_grp, page_id=input_file.pageId, mimetype=MIMETYPE_PAGE, diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index bd254f97f1..732b1d0dbc 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -95,6 +95,9 @@ def reload_mets(self): self.mets = OcrdMets(filename=self.mets_target) @deprecated_alias(pageId="page_id") + @deprecated_alias(ID="file_id") + @deprecated_alias(fileGrp="file_grp") + @deprecated_alias(fileGrp_mapping="filegrp_mapping") def merge(self, other_workspace, copy_files=True, **kwargs): """ Merge ``other_workspace`` into this one @@ -118,6 +121,13 @@ def after_add_cb(f): copyfileobj(fstream_in, fstream_out) if 'page_id' in kwargs: kwargs['pageId'] = kwargs.pop('page_id') + if 'file_id' in kwargs: + kwargs['ID'] = kwargs.pop('file_id') + if 'file_grp' in kwargs: + 
kwargs['fileGrp'] = kwargs.pop('file_grp') + if 'filegrp_mapping' in kwargs: + kwargs['fileGrp_mapping'] = kwargs.pop('filegrp_mapping') + self.mets.merge(other_workspace.mets, after_add_cb=after_add_cb, **kwargs) @@ -165,12 +175,12 @@ def download_file(self, f, _recursion_count=0): f.local_filename = f.url return f - def remove_file(self, ID, force=False, keep_file=False, page_recursive=False, page_same_group=False): + def remove_file(self, file_id, force=False, keep_file=False, page_recursive=False, page_same_group=False): """ Remove a METS `file` from the workspace. Arguments: - ID (string|:py:class:`ocrd_models.ocrd_file.OcrdFile`): `@ID` of the METS `file` + file_id (string|:py:class:`ocrd_models.ocrd_file.OcrdFile`): `@ID` of the METS `file` to delete or the file itself Keyword Args: force (boolean): Continue removing even if file not found in METS @@ -181,19 +191,19 @@ def remove_file(self, ID, force=False, keep_file=False, page_recursive=False, pa Has no effect unless ``page_recursive`` is `True`. 
""" log = getLogger('ocrd.workspace.remove_file') - log.debug('Deleting mets:file %s', ID) + log.debug('Deleting mets:file %s', file_id) if not force and self.overwrite_mode: force = True - if isinstance(ID, OcrdFile): - ID = ID.ID + if isinstance(file_id, OcrdFile): + file_id = file_id.ID try: try: - ocrd_file = next(self.mets.find_files(ID=ID)) + ocrd_file = next(self.mets.find_files(ID=file_id)) except StopIteration: - if ID.startswith(REGEX_PREFIX): + if file_id.startswith(REGEX_PREFIX): # allow empty results if filter criteria involve a regex return None - raise FileNotFoundError("File %s not found in METS" % ID) + raise FileNotFoundError("File %s not found in METS" % file_id) if page_recursive and ocrd_file.mimetype == MIMETYPE_PAGE: with pushd_popd(self.directory): ocrd_page = parse(self.download_file(ocrd_file).local_filename, silence=True) @@ -213,7 +223,7 @@ def remove_file(self, ID, force=False, keep_file=False, page_recursive=False, pa log.info("rm %s [cwd=%s]", ocrd_file.local_filename, self.directory) unlink(ocrd_file.local_filename) # Remove from METS only after the recursion of AlternativeImages - self.mets.remove_file(ID) + self.mets.remove_file(file_id) return ocrd_file except FileNotFoundError as e: if not force: @@ -331,6 +341,7 @@ def rename_file_group(self, old, new): Path(old).rmdir() @deprecated_alias(pageId="page_id") + @deprecated_alias(ID="file_id") def add_file(self, file_grp, content=None, **kwargs): """ Add a file to the :py:class:`ocrd_models.ocrd_mets.OcrdMets` of the workspace. 
@@ -368,6 +379,9 @@ def add_file(self, file_grp, content=None, **kwargs): # print(kwargs) kwargs["pageId"] = kwargs.pop("page_id") + if "file_id" in kwargs: + kwargs["ID"] = kwargs.pop("file_id") + ret = self.mets.add_file(file_grp, **kwargs) if content is not None: @@ -1002,7 +1016,7 @@ def save_image_file(self, image, file_path = str(Path(file_grp, '%s%s' % (file_id, MIME_TO_EXT[mimetype]))) out = self.add_file( file_grp, - ID=file_id, + file_id=file_id, page_id=page_id, local_filename=file_path, mimetype=mimetype, From 6defdc2191ce9de40c6f5961e1c958d429664197 Mon Sep 17 00:00:00 2001 From: joschrew Date: Thu, 9 Jun 2022 15:24:57 +0200 Subject: [PATCH 11/36] change tests acording to params-snake_case-changes --- tests/cli/test_workspace.py | 2 +- tests/processor/test_processor.py | 28 ++++++++++++++-------------- tests/test_decorators.py | 8 ++++---- tests/test_task_sequence.py | 2 +- tests/test_workspace.py | 22 +++++++++++----------- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index b728873fc9..3400849461 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -51,7 +51,7 @@ def test_add(self): ws_api = self.resolver.workspace_from_nothing(directory=tempdir) ws_api.add_file( file_grp, - ID=ID, + file_id=ID, content=content, page_id=page_id, mimetype=mimetype, diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index b83564b1fb..57e09eec20 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -105,10 +105,10 @@ def test_zip_input_files(self): class ZipTestProcessor(Processor): pass with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', page_id='phys_0001') - ws.add_file('GRP2', mimetype='application/alto+xml', ID='foobar2', page_id='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, 
ID='foobar3', page_id='phys_0002') - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar4', page_id='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP2', mimetype='application/alto+xml', file_id='foobar2', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar3', page_id='phys_0002') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) @@ -125,12 +125,12 @@ def test_zip_input_files_multi_mixed(self): class ZipTestProcessor(Processor): pass with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', page_id='phys_0001') - ws.add_file('GRP1', mimetype='image/png', ID='foobar1img1', page_id='phys_0001') - ws.add_file('GRP1', mimetype='image/png', ID='foobar1img2', page_id='phys_0001') - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', page_id='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar3', page_id='phys_0002') - ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4', page_id='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype='image/png', file_id='foobar1img1', page_id='phys_0001') + ws.add_file('GRP1', mimetype='image/png', file_id='foobar1img2', page_id='phys_0001') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar3', page_id='phys_0002') + ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', 
page_id=page_id) @@ -141,7 +141,7 @@ class ZipTestProcessor(Processor): pass print("PAGE-filtered") tuples = [(one.ID, two) for one, two in proc.zip_input_files(mimetype=MIMETYPE_PAGE)] assert ('foobar3', None) in tuples - ws.add_file('GRP2', mimetype='image/tiff', ID='foobar4dup', page_id='phys_0002') + ws.add_file('GRP2', mimetype='image/tiff', file_id='foobar4dup', page_id='phys_0002') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) @@ -152,7 +152,7 @@ class ZipTestProcessor(Processor): pass assert ('foobar3', None) in tuples with self.assertRaisesRegex(Exception, "No PAGE-XML for page .* in fileGrp .* but multiple matches."): tuples = proc.zip_input_files(on_error='abort') - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2dup', page_id='phys_0001') + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2dup', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) @@ -164,8 +164,8 @@ class ZipTestProcessor(Processor): pass self.capture_out_err() with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', page_id=None) - ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, ID='foobar2', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id=None) + ws.add_file('GRP2', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0001') for page_id in [None, 'phys_0001,phys_0002']: with self.subTest(page_id=page_id): proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 2b44a13eda..ae2cec57ca 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -119,10 +119,10 @@ 
def _sample_ws_for_overwrite(self): resolver = Resolver() with TemporaryDirectory() as tempdir: ws = resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('IN-GRP', page_id='pID1', ID='fID1', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID1.tif')) - ws.add_file('OUT-GRP', page_id='pID2', ID='fID2', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID2.tif')) - ws.add_file('OUT-GRP', page_id='pID3', ID='fID3', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID3.tif')) - ws.add_file('OUT-GRP', page_id='pID4', ID='fID4', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID4.tif')) + ws.add_file('IN-GRP', page_id='pID1', file_id='fID1', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID1.tif')) + ws.add_file('OUT-GRP', page_id='pID2', file_id='fID2', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID2.tif')) + ws.add_file('OUT-GRP', page_id='pID3', file_id='fID3', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID3.tif')) + ws.add_file('OUT-GRP', page_id='pID4', file_id='fID4', mimetype='image/tiff', content='CONTENT', local_filename=join(tempdir, 'ID4.tif')) ws.save_mets() yield ws diff --git a/tests/test_task_sequence.py b/tests/test_task_sequence.py index 826b0c9665..23300712bf 100644 --- a/tests/test_task_sequence.py +++ b/tests/test_task_sequence.py @@ -138,7 +138,7 @@ def test_task_run(self): with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir: with pushd_popd(wsdir): ws = resolver.workspace_from_url('mets.xml') - ws.add_file('GRP0', content='', local_filename='GRP0/foo', ID='file0', mimetype=MIMETYPE_PAGE, page_id=None) + ws.add_file('GRP0', content='', local_filename='GRP0/foo', file_id='file0', mimetype=MIMETYPE_PAGE, page_id=None) ws.save_mets() files_before = len(ws.mets.find_all_files()) run_tasks('mets.xml', 'DEBUG', None, [ diff --git 
a/tests/test_workspace.py b/tests/test_workspace.py index 45c838672e..e7eeac78b8 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -63,7 +63,7 @@ def test_workspace_add_file(plain_workspace): # act plain_workspace.add_file( 'GRP', - ID='ID1', + file_id='ID1', mimetype='image/tiff', content='CONTENT', page_id=None, @@ -80,7 +80,7 @@ def test_workspace_add_file(plain_workspace): def test_workspace_add_file_basename_no_content(plain_workspace): - plain_workspace.add_file('GRP', ID='ID1', mimetype='image/tiff', page_id=None) + plain_workspace.add_file('GRP', file_id='ID1', mimetype='image/tiff', page_id=None) f = next(plain_workspace.mets.find_files()) # assert @@ -89,7 +89,7 @@ def test_workspace_add_file_basename_no_content(plain_workspace): def test_workspace_add_file_binary_content(plain_workspace): fpath = join(plain_workspace.directory, 'subdir', 'ID1.tif') - plain_workspace.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar', page_id=None) + plain_workspace.add_file('GRP', file_id='ID1', content=b'CONTENT', local_filename=fpath, url='http://foo/bar', page_id=None) # assert assert exists(fpath) @@ -98,7 +98,7 @@ def test_workspace_add_file_binary_content(plain_workspace): def test_workspacec_add_file_content_wo_local_filename(plain_workspace): # act with pytest.raises(Exception) as fn_exc: - plain_workspace.add_file('GRP', ID='ID1', content=b'CONTENT', page_id='foo1234') + plain_workspace.add_file('GRP', file_id='ID1', content=b'CONTENT', page_id='foo1234') assert "'content' was set but no 'local_filename'" in str(fn_exc.value) @@ -106,7 +106,7 @@ def test_workspacec_add_file_content_wo_local_filename(plain_workspace): def test_workspacec_add_file_content_wo_pageid(plain_workspace): # act with pytest.raises(ValueError) as val_err: - plain_workspace.add_file('GRP', ID='ID1', content=b'CONTENT', local_filename='foo') + plain_workspace.add_file('GRP', file_id='ID1', content=b'CONTENT', local_filename='foo') 
assert "workspace.add_file must be passed a 'page_id' kwarg, even if it is None." in str(val_err.value) @@ -260,7 +260,7 @@ def test_remove_file_force(sbb_data_workspace): def test_remove_file_remote_not_available_raises_exception(plain_workspace): - plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', url='http://remote', page_id=None) + plain_workspace.add_file('IMG', file_id='page1_img', mimetype='image/tiff', url='http://remote', page_id=None) with pytest.raises(Exception) as not_avail_exc: plain_workspace.remove_file('page1_img') @@ -270,7 +270,7 @@ def test_remove_file_remote_not_available_raises_exception(plain_workspace): def test_remove_file_remote(plain_workspace): # act - plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', url='http://remote', page_id=None) + plain_workspace.add_file('IMG', file_id='page1_img', mimetype='image/tiff', url='http://remote', page_id=None) # must succeed because removal is enforced assert plain_workspace.remove_file('page1_img', force=True) @@ -342,7 +342,7 @@ def test_remove_file_group_flat(plain_workspace): """ # act - added_res = plain_workspace.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='file.ext', content='foo', page_id=None).url + added_res = plain_workspace.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='file.ext', content='foo', page_id=None).url # requires additional prepending of current path because not pushd_popd-magic at work added_path = Path(join(plain_workspace.directory, added_res)) @@ -382,8 +382,8 @@ def test_download_to_directory_from_workspace_download_file(plain_workspace): """ https://github.com/OCR-D/core/issues/342 """ - f1 = plain_workspace.add_file('IMG', ID='page1_img', mimetype='image/tiff', local_filename='test.tif', content='', page_id=None) - f2 = plain_workspace.add_file('GT', ID='page1_gt', mimetype='text/xml', local_filename='test.xml', content='', page_id=None) + f1 = plain_workspace.add_file('IMG', 
file_id='page1_img', mimetype='image/tiff', local_filename='test.tif', content='', page_id=None) + f2 = plain_workspace.add_file('GT', file_id='page1_gt', mimetype='text/xml', local_filename='test.xml', content='', page_id=None) assert f1.url == 'test.tif' assert f2.url == 'test.xml' @@ -577,7 +577,7 @@ def test_downsample_16bit_image(plain_workspace): tif_out.write(gzip_in.read()) # act - plain_workspace.add_file('IMG', ID='foo', url=img_path, mimetype='image/tiff', page_id=None) + plain_workspace.add_file('IMG', file_id='foo', url=img_path, mimetype='image/tiff', page_id=None) # assert pil_before = Image.open(img_path) From 35f854cdaa4bb93ec3dccaa450674fedcc67b98d Mon Sep 17 00:00:00 2001 From: jonas <> Date: Sun, 12 Jun 2022 21:01:53 +0200 Subject: [PATCH 12/36] workspace: find_files-delegator to OcrdMets --- ocrd/ocrd/cli/workspace.py | 16 ++++++++-------- ocrd/ocrd/workspace.py | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index b8ba5fa6fe..f3f06c1733 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -392,11 +392,11 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down modified_mets = False ret = list() workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) - for f in workspace.mets.find_files( - ID=file_id, - fileGrp=file_grp, + for f in workspace.find_files( + file_id=file_id, + file_grp=file_grp, mimetype=mimetype, - pageId=page_id, + page_id=page_id, ): if download and not f.local_filename: workspace.download_file(f) @@ -495,11 +495,11 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): """ workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) with pushd_popd(workspace.directory): - for f in workspace.mets.find_files( - ID=file_id, - fileGrp=file_grp, + for f in 
workspace.find_files( + file_id=file_id, + file_grp=file_grp, mimetype=mimetype, - pageId=page_id, + page_id=page_id, ): try: if not f.local_filename or not exists(f.local_filename): diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index 732b1d0dbc..1024f04a93 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -1026,6 +1026,28 @@ def save_image_file(self, image, file_id, file_grp, out.local_filename) return file_path + def find_files(self, **kwargs): + """ + Search ``mets:file`` entries in wrapped METS document and yield results. + + Delegator to :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files` + + Keyword Args: + **kwargs: See :py:func:`ocrd_models.ocrd_mets.OcrdMets.find_files` + Returns: + Generator which yields :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations + """ + log = getLogger('ocrd.workspace.find_files') + log.debug('find files in mets. kwargs=%s' % kwargs) + if "page_id" in kwargs: + kwargs["pageId"] = kwargs.pop("page_id") + if "file_id" in kwargs: + kwargs["ID"] = kwargs.pop("file_id") + if "file_grp" in kwargs: + kwargs["fileGrp"] = kwargs.pop("file_grp") + with pushd_popd(self.directory): + return self.mets.find_files(**kwargs) + def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs): segment_coords = parent_coords.copy() # get polygon outline of segment relative to parent image: From f3dc8529437d4380da6544dd9ce7c7a992051e4d Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 14 Jun 2022 13:57:26 +0200 Subject: [PATCH 13/36] update output_field in workspace_find add possible output_field values in workspace_find to allow providing them with snake_case in addition to old camelCase-naming --- ocrd/ocrd/cli/workspace.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index f3f06c1733..3feecec798 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -372,8 +372,11 @@ def workspace_cli_bulk_add(ctx, 
regex, mimetype, page_id, file_id, url, file_grp type=click.Choice([ 'url', 'mimetype', + 'page_id', 'pageId', + 'file_id', 'ID', + 'file_grp', 'fileGrp', 'basename', 'basename_without_extension', @@ -389,6 +392,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"} + output_field = [x if x not in snake_to_camel else snake_to_camel[x] for x in output_field] modified_mets = False ret = list() workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) From 06921d924d4d558f5636572db7294496291c93b3 Mon Sep 17 00:00:00 2001 From: joschrew <91774427+joschrew@users.noreply.github.com> Date: Tue, 14 Jun 2022 14:23:59 +0200 Subject: [PATCH 14/36] refactor last commit a bit Update ocrd/ocrd/cli/workspace.py Co-authored-by: Konstantin Baierer --- ocrd/ocrd/cli/workspace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 3feecec798..d5f0f08473 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -393,7 +393,7 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down will be interpreted as a regular expression.) 
""" snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"} - output_field = [x if x not in snake_to_camel else snake_to_camel[x] for x in output_field] + output_field = [snake_to_camel.get(x, x) for x in output_field] modified_mets = False ret = list() workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) From 42b54aa76c4e673ab6238d84a6c4a1f94a0f94d4 Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 14 Jun 2022 14:48:08 +0200 Subject: [PATCH 15/36] small bugfix in OcrdMets.find_files --- ocrd_models/ocrd_models/ocrd_mets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 8161684c58..66d43c27a7 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -167,7 +167,7 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None pageIds_expanded = [] for pageId_ in pageIds: if '..' in pageId_: - pageIds_expanded += generate_range(*pageId_.split('..', 2)) + pageIds_expanded += generate_range(*pageId_.split('..', 1)) pageIds += pageIds_expanded for page in self._tree.getroot().xpath( '//mets:div[@TYPE="page"]', namespaces=NS): From 8c6374534ffe43c7a5bce55fb3824cbc5b696bda Mon Sep 17 00:00:00 2001 From: jonas <> Date: Tue, 14 Jun 2022 20:05:38 +0200 Subject: [PATCH 16/36] parameter passing in find_files delegator --- ocrd/ocrd/workspace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index 1024f04a93..ee3eb1ff48 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -1026,7 +1026,7 @@ def save_image_file(self, image, file_id, file_grp, out.local_filename) return file_path - def find_files(self, **kwargs): + def find_files(self, *args, **kwargs): """ Search ``mets:file`` entries in wrapped METS document and yield results. 
@@ -1046,7 +1046,7 @@ def find_files(self, **kwargs): if "file_grp" in kwargs: kwargs["fileGrp"] = kwargs.pop("file_grp") with pushd_popd(self.directory): - return self.mets.find_files(**kwargs) + return self.mets.find_files(*args, **kwargs) def _crop(log, name, segment, parent_image, parent_coords, op='cropped', **kwargs): segment_coords = parent_coords.copy() From 178d45ea02b841bbafe18e0f3f4077e5fb7c0adc Mon Sep 17 00:00:00 2001 From: joschrew Date: Sun, 19 Jun 2022 14:28:23 +0200 Subject: [PATCH 17/36] workspace_add_file: use add_file-delegator --- ocrd/ocrd/cli/workspace.py | 9 ++++----- ocrd/ocrd/workspace.py | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index d5f0f08473..6dc6512b6b 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -183,8 +183,8 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ except KeyError: log.error("Cannot guess mimetype from extension '%s' for '%s'. 
Set --mimetype explicitly" % (Path(fname).suffix, fname)) - kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype, 'pageId': page_id, 'force': force, 'ignore': ignore} - log.debug("Adding '%s' (%s)", fname, kwargs) + log.debug("Adding '%s'", fname) + local_filename = None if not (fname.startswith('http://') or fname.startswith('https://')): if not fname.startswith(ctx.directory): if not isabs(fname) and exists(join(ctx.directory, fname)): @@ -202,12 +202,11 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ sys.exit(1) if fname.startswith(ctx.directory): fname = relpath(fname, ctx.directory) - kwargs['local_filename'] = fname + local_filename = fname - kwargs['url'] = fname if not page_id: log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.") - workspace.mets.add_file(**kwargs) + workspace.add_file(file_grp, file_id=file_id, mimetype=mimetype, page_id=page_id, force=force, ignore=ignore, local_filename=local_filename, url=fname) workspace.save_mets() # ---------------------------------------------------------------------- diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index ee3eb1ff48..e804a674f8 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -363,13 +363,13 @@ def add_file(self, file_grp, content=None, **kwargs): content is not None) if 'page_id' not in kwargs: raise ValueError("workspace.add_file must be passed a 'page_id' kwarg, even if it is None.") - if content is not None and 'local_filename' not in kwargs: + if content is not None and not kwargs.get('local_filename'): raise Exception("'content' was set but no 'local_filename'") if self.overwrite_mode: kwargs['force'] = True with pushd_popd(self.directory): - if 'local_filename' in kwargs: + if kwargs.get('local_filename'): # If the local filename has folder components, create those folders local_filename_dir = kwargs['local_filename'].rsplit('/', 1)[0] if local_filename_dir != 
kwargs['local_filename'] and not Path(local_filename_dir).is_dir(): From ac31accbe71e65bbaeccba6e94227c3324ed334f Mon Sep 17 00:00:00 2001 From: joschrew Date: Sun, 19 Jun 2022 16:47:57 +0200 Subject: [PATCH 18/36] add 2 tests for recent camelCase-args changes --- tests/cli/test_workspace.py | 13 ++++++++++++- tests/test_workspace.py | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index 3400849461..e0a3260c20 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -255,7 +255,6 @@ def test_add_existing_checked(self): f = ws.mets.find_all_files()[0] self.assertEqual(f.url, 'test.tif') - def test_find_all_files(self): with TemporaryDirectory() as tempdir: wsdir = join(tempdir, 'ws') @@ -265,6 +264,18 @@ def test_find_all_files(self): self.assertEqual(result.output, 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n') self.assertEqual(result.exit_code, 0) + def test_find_all_files_camelcase_outputfield(self): + with TemporaryDirectory() as tempdir: + wsdir = join(tempdir, 'ws') + copytree(assets.path_to('SBB0000F29300010000/data'), wsdir) + with pushd_popd(wsdir): + result = self.runner.invoke(workspace_cli, + ['find', '-G', 'OCR-D-IMG-BIN', '-k', + 'file_grp', '-k', 'file_id', '-k', 'page_id']) + self.assertEqual(result.exit_code, 0) + self.assertEqual(result.output, 'OCR-D-IMG-BIN\tFILE_0001_IMAGE_BIN\tPHYS_0001\n' + 'OCR-D-IMG-BIN\tFILE_0002_IMAGE_BIN\tPHYS_0002\n') + def test_prune_files(self): with TemporaryDirectory() as tempdir: copytree(assets.path_to('SBB0000F29300010000/data'), join(tempdir, 'ws')) diff --git a/tests/test_workspace.py b/tests/test_workspace.py index e7eeac78b8..6cc3fd0bef 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -621,5 +621,31 @@ def test_merge(tmp_path): assert exists(join(dst_path1, 'OCR-D-IMG/FILE_0001_IMAGE.tif')) +def test_merge_with_camelcase_args(plain_workspace, tmp_path): + # arrange + page_id1, 
file_id1, file_grp1 = 'page1', 'ID1', 'GRP1' + plain_workspace.add_file(file_grp1, file_id='ID1', mimetype='image/tiff', page_id='page1') + + dst_path2 = tmp_path / 'foo' + resolver = Resolver() + ws2 = resolver.workspace_from_nothing(directory=dst_path2) + page_id2, file_id2, file_grp2 = 'page2', 'ID2', 'GRP2' + ws2.add_file('GRP2', file_id=file_id2, mimetype='image/tiff', page_id=page_id2) + ws2.add_file('GRP2', file_id='ID2-2', mimetype='image/tiff', page_id='page3') + + # act + plain_workspace.merge(ws2, copy_files=False, page_id=page_id2, file_id=file_id2, + file_grp=file_grp2, filegrp_mapping={file_grp2: file_grp1}) + + # assert: + files = list(plain_workspace.find_files()) + assert len(files) == 2 + + for f in files: + assert f.fileGrp == file_grp1 + assert f.pageId in [page_id1, page_id2] + assert f.ID in [file_id1, file_id2] + + if __name__ == '__main__': main(__file__) From b5ae18f7003ac1250493ec2d110fe8aa4cde6c0b Mon Sep 17 00:00:00 2001 From: joschrew Date: Mon, 20 Jun 2022 08:16:09 +0200 Subject: [PATCH 19/36] rename previously added tests --- tests/cli/test_workspace.py | 2 +- tests/test_workspace.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cli/test_workspace.py b/tests/cli/test_workspace.py index e0a3260c20..807e07b722 100644 --- a/tests/cli/test_workspace.py +++ b/tests/cli/test_workspace.py @@ -264,7 +264,7 @@ def test_find_all_files(self): self.assertEqual(result.output, 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n') self.assertEqual(result.exit_code, 0) - def test_find_all_files_camelcase_outputfield(self): + def test_find_all_files_outputfield(self): with TemporaryDirectory() as tempdir: wsdir = join(tempdir, 'ws') copytree(assets.path_to('SBB0000F29300010000/data'), wsdir) diff --git a/tests/test_workspace.py b/tests/test_workspace.py index 6cc3fd0bef..092cbc6379 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -621,7 +621,7 @@ def test_merge(tmp_path): assert exists(join(dst_path1, 
'OCR-D-IMG/FILE_0001_IMAGE.tif')) -def test_merge_with_camelcase_args(plain_workspace, tmp_path): +def test_merge_with_filter(plain_workspace, tmp_path): # arrange page_id1, file_id1, file_grp1 = 'page1', 'ID1', 'GRP1' plain_workspace.add_file(file_grp1, file_id='ID1', mimetype='image/tiff', page_id='page1') From 07c9b40e7bfd5b907a777960db124ac63012a326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=BCnsebeck?= Date: Thu, 23 Jun 2022 14:17:09 +0200 Subject: [PATCH 20/36] Use importlib.metadata.version instead of pkg_resources.get_distribution --- ocrd/ocrd/workspace_bagger.py | 12 ++++++++---- ocrd_utils/ocrd_utils/constants.py | 7 +++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py index 28ae155b9a..6ba6db7160 100644 --- a/ocrd/ocrd/workspace_bagger.py +++ b/ocrd/ocrd/workspace_bagger.py @@ -6,8 +6,6 @@ import re import tempfile import sys - -from pkg_resources import get_distribution from bagit import Bag, make_manifests # pylint: disable=no-name-in-module from ocrd_utils import ( @@ -25,6 +23,12 @@ from .workspace import Workspace +try: + from importlib.metadata import version +except ImportError: + from importlib_metadata import version + + tempfile.tempdir = '/tmp' # TODO hard-coded BACKUPDIR = join('/tmp', TMP_BAGIT_PREFIX + 'backup') @@ -123,8 +127,8 @@ def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_man bag.info['BagIt-Profile-Identifier'] = OCRD_BAGIT_PROFILE_URL bag.info['Bag-Software-Agent'] = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % ( VERSION, # TODO - get_distribution('bagit').version, - get_distribution('bagit_profile').version, + version('bagit'), + version('bagit_profile'), ' '.join(sys.argv)) bag.info['Ocrd-Identifier'] = ocrd_identifier diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index 121e5df612..6f0202dde1 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ 
b/ocrd_utils/ocrd_utils/constants.py @@ -1,10 +1,13 @@ """ Constants for ocrd_utils. """ -from pkg_resources import get_distribution from re import compile as regex_compile from os import environ from os.path import join, expanduser +try: + from importlib.metadata import version +except ImportError: + from importlib_metadata import version __all__ = [ 'EXT_TO_MIME', @@ -22,7 +25,7 @@ 'XDG_DATA_HOME', ] -VERSION = get_distribution('ocrd_utils').version +VERSION = version('ocrd_utils') MIMETYPE_PAGE = 'application/vnd.prima.page+xml' From 0b7c125804213f17e1a2f6754575ff28aa13b855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=BCnsebeck?= Date: Thu, 23 Jun 2022 14:18:36 +0200 Subject: [PATCH 21/36] Use importlib based replacements instead of pkg_resources.resource_* --- ocrd/ocrd/constants.py | 2 +- ocrd/requirements.txt | 2 + ocrd_models/ocrd_models/constants.py | 2 +- ocrd_utils/ocrd_utils/package_resources.py | 45 ++++++++++++++++++++ ocrd_validators/ocrd_validators/constants.py | 2 +- 5 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 ocrd_utils/ocrd_utils/package_resources.py diff --git a/ocrd/ocrd/constants.py b/ocrd/ocrd/constants.py index 1d436a7fa9..2e9c17c649 100644 --- a/ocrd/ocrd/constants.py +++ b/ocrd/ocrd/constants.py @@ -1,7 +1,7 @@ """ Constants for ocrd. 
""" -from pkg_resources import resource_filename +from ocrd_utils.package_resources import resource_filename __all__ = [ 'TMP_PREFIX', diff --git a/ocrd/requirements.txt b/ocrd/requirements.txt index 2da0163b74..42e6c16a99 100644 --- a/ocrd/requirements.txt +++ b/ocrd/requirements.txt @@ -8,3 +8,5 @@ Flask jsonschema pyyaml Deprecated == 1.2.0 +importlib_metadata;python_version<'3.8' +importlib_resources;python_version<'3.8' diff --git a/ocrd_models/ocrd_models/constants.py b/ocrd_models/ocrd_models/constants.py index 6c8b0e1017..b3fe89a4c9 100644 --- a/ocrd_models/ocrd_models/constants.py +++ b/ocrd_models/ocrd_models/constants.py @@ -1,7 +1,7 @@ """ Constants for ocrd_models. """ -from pkg_resources import resource_string +from ocrd_utils.package_resources import resource_string import re __all__ = [ diff --git a/ocrd_utils/ocrd_utils/package_resources.py b/ocrd_utils/ocrd_utils/package_resources.py new file mode 100644 index 0000000000..0f268793ea --- /dev/null +++ b/ocrd_utils/ocrd_utils/package_resources.py @@ -0,0 +1,45 @@ +import atexit +from contextlib import ExitStack +from pathlib import Path + +try: + from importlib.resources import path, read_binary +except ImportError: + from importlib_resources import path, read_binary # type: ignore + + +_file_manager = ExitStack() +atexit.register(_file_manager.close) + + +def resource_filename(package: str, resource: str) -> Path: + """ + Reimplementation of the function with the same name from pkg_resources + + Using importlib for better performance + + package : str + The package from where to start looking for resource (often __name__) + resource : str + The resource to look up + """ + parent_package = package.rsplit('.',1)[0] + return _file_manager.enter_context(path(parent_package, resource)) + + +def resource_string(package: str, resource: str) -> bytes: + """ + Reimplementation of the function with the same name from pkg_resources + + Using importlib for better performance + + package : str + The package 
from where to start looking for resource (often __name__) + resource : str + The resource to look up + """ + parent_package = package.rsplit('.',1)[0] + return read_binary(parent_package, resource) + + +__all__ = ['resource_filename', 'resource_string'] \ No newline at end of file diff --git a/ocrd_validators/ocrd_validators/constants.py b/ocrd_validators/ocrd_validators/constants.py index 25d2e0e53b..5497102f25 100644 --- a/ocrd_validators/ocrd_validators/constants.py +++ b/ocrd_validators/ocrd_validators/constants.py @@ -2,7 +2,7 @@ Constants for ocrd_validators. """ import yaml -from pkg_resources import resource_string, resource_filename +from ocrd_utils.package_resources import resource_string, resource_filename __all__ = [ 'OCRD_TOOL_SCHEMA', From 6f0728e0f1e97b686f3e4c23388c22ef5425cb25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=BCnsebeck?= Date: Thu, 23 Jun 2022 14:19:19 +0200 Subject: [PATCH 22/36] Use importlib based replacements instead of pkg_resources.resource_* and move ocrd-tool.json to a importlib findable place --- ocrd/ocrd/processor/builtin/dummy_processor.py | 4 ++-- ocrd/ocrd/processor/builtin/{dummy => }/ocrd-tool.json | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename ocrd/ocrd/processor/builtin/{dummy => }/ocrd-tool.json (100%) diff --git a/ocrd/ocrd/processor/builtin/dummy_processor.py b/ocrd/ocrd/processor/builtin/dummy_processor.py index 9a1ad511e7..cce37a7430 100644 --- a/ocrd/ocrd/processor/builtin/dummy_processor.py +++ b/ocrd/ocrd/processor/builtin/dummy_processor.py @@ -1,6 +1,6 @@ # pylint: disable=missing-module-docstring,invalid-name from os.path import join, basename -from pkg_resources import resource_string +from ocrd_utils.package_resources import resource_string import click @@ -17,7 +17,7 @@ ) from ocrd_modelfactory import page_from_file -OCRD_TOOL = parse_json_string_with_comments(resource_string(__name__, 'dummy/ocrd-tool.json').decode('utf8')) +OCRD_TOOL = 
parse_json_string_with_comments(resource_string(__name__, 'ocrd-tool.json').decode('utf8')) class DummyProcessor(Processor): """ diff --git a/ocrd/ocrd/processor/builtin/dummy/ocrd-tool.json b/ocrd/ocrd/processor/builtin/ocrd-tool.json similarity index 100% rename from ocrd/ocrd/processor/builtin/dummy/ocrd-tool.json rename to ocrd/ocrd/processor/builtin/ocrd-tool.json From 79651f24d9c3f8bef4e6c8e34bd789a26763c185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=BCnsebeck?= Date: Thu, 23 Jun 2022 14:20:00 +0200 Subject: [PATCH 23/36] upgrade setuptools for faster console_scripts entrypoint generation --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6da911a0f2..54396a0d13 100644 --- a/Makefile +++ b/Makefile @@ -68,7 +68,7 @@ deps-test: # (Re)install the tool install: - $(PIP) install -U pip wheel + $(PIP) install -U pip wheel setuptools for mod in $(BUILD_ORDER);do (cd $$mod ; $(PIP_INSTALL) .);done # Install with pip install -e From 50eae8b02b98d296ba600efcff7543c159daf0a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=BCnsebeck?= Date: Thu, 23 Jun 2022 14:20:40 +0200 Subject: [PATCH 24/36] Use fastentrypoints --- Makefile | 2 +- ocrd/setup.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 54396a0d13..2d5ebda7e9 100644 --- a/Makefile +++ b/Makefile @@ -68,7 +68,7 @@ deps-test: # (Re)install the tool install: - $(PIP) install -U pip wheel setuptools + $(PIP) install -U pip wheel setuptools fastentrypoints for mod in $(BUILD_ORDER);do (cd $$mod ; $(PIP_INSTALL) .);done # Install with pip install -e diff --git a/ocrd/setup.py b/ocrd/setup.py index 0c8c0fa2ae..0269893e28 100644 --- a/ocrd/setup.py +++ b/ocrd/setup.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import fastentrypoints from setuptools import setup, find_packages from ocrd_utils import VERSION From cf3eb36133a1ae6cf1fef58d183b861b16f08b0c Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Johannes=20K=C3=BCnsebeck?= Date: Wed, 29 Jun 2022 20:48:21 +0200 Subject: [PATCH 25/36] Move importlib_* requirements to correct module --- ocrd/requirements.txt | 4 +--- ocrd_utils/requirements.txt | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd/requirements.txt b/ocrd/requirements.txt index 42e6c16a99..ca62ed9370 100644 --- a/ocrd/requirements.txt +++ b/ocrd/requirements.txt @@ -7,6 +7,4 @@ opencv-python-headless Flask jsonschema pyyaml -Deprecated == 1.2.0 -importlib_metadata;python_version<'3.8' -importlib_resources;python_version<'3.8' +Deprecated == 1.2.0 \ No newline at end of file diff --git a/ocrd_utils/requirements.txt b/ocrd_utils/requirements.txt index 300ed90949..de4e7adee3 100644 --- a/ocrd_utils/requirements.txt +++ b/ocrd_utils/requirements.txt @@ -3,3 +3,5 @@ Pillow >= 7.2.0 # tensorflow versions might require different versions numpy atomicwrites >= 1.3.0 +importlib_metadata;python_version<'3.8' +importlib_resources;python_version<'3.8' From 0caa1a1f955c801aa759b04c61c7a5be62175284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=BCnsebeck?= Date: Wed, 29 Jun 2022 20:48:58 +0200 Subject: [PATCH 26/36] Use importlib based replacement for pkg_resources get_distribution --- ocrd/ocrd/workspace_bagger.py | 11 +++-------- ocrd_utils/ocrd_utils/constants.py | 8 +++----- ocrd_utils/ocrd_utils/package_resources.py | 6 +++++- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py index 6ba6db7160..ac215fa7e7 100644 --- a/ocrd/ocrd/workspace_bagger.py +++ b/ocrd/ocrd/workspace_bagger.py @@ -20,15 +20,10 @@ from ocrd_validators.constants import BAGIT_TXT, TMP_BAGIT_PREFIX, OCRD_BAGIT_PROFILE_URL from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import to_xml +from ocrd_utils.package_resources import get_distribution from .workspace import Workspace -try: - from importlib.metadata import version -except ImportError: - from 
importlib_metadata import version - - tempfile.tempdir = '/tmp' # TODO hard-coded BACKUPDIR = join('/tmp', TMP_BAGIT_PREFIX + 'backup') @@ -127,8 +122,8 @@ def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_man bag.info['BagIt-Profile-Identifier'] = OCRD_BAGIT_PROFILE_URL bag.info['Bag-Software-Agent'] = 'ocrd/core %s (bagit.py %s, bagit_profile %s) [cmdline: "%s"]' % ( VERSION, # TODO - version('bagit'), - version('bagit_profile'), + get_distribution('bagit').version, + get_distribution('bagit_profile').version, ' '.join(sys.argv)) bag.info['Ocrd-Identifier'] = ocrd_identifier diff --git a/ocrd_utils/ocrd_utils/constants.py b/ocrd_utils/ocrd_utils/constants.py index 6f0202dde1..1164025081 100644 --- a/ocrd_utils/ocrd_utils/constants.py +++ b/ocrd_utils/ocrd_utils/constants.py @@ -4,10 +4,8 @@ from re import compile as regex_compile from os import environ from os.path import join, expanduser -try: - from importlib.metadata import version -except ImportError: - from importlib_metadata import version + +from ocrd_utils.package_resources import get_distribution __all__ = [ 'EXT_TO_MIME', @@ -25,7 +23,7 @@ 'XDG_DATA_HOME', ] -VERSION = version('ocrd_utils') +VERSION = get_distribution('ocrd_utils').version MIMETYPE_PAGE = 'application/vnd.prima.page+xml' diff --git a/ocrd_utils/ocrd_utils/package_resources.py b/ocrd_utils/ocrd_utils/package_resources.py index 0f268793ea..57644ba379 100644 --- a/ocrd_utils/ocrd_utils/package_resources.py +++ b/ocrd_utils/ocrd_utils/package_resources.py @@ -7,6 +7,10 @@ except ImportError: from importlib_resources import path, read_binary # type: ignore +try: + from importlib.metadata import distribution as get_distribution +except ImportError: + from importlib_metadata import distribution as get_distribution _file_manager = ExitStack() atexit.register(_file_manager.close) @@ -42,4 +46,4 @@ def resource_string(package: str, resource: str) -> bytes: return read_binary(parent_package, resource) -__all__ = 
['resource_filename', 'resource_string'] \ No newline at end of file +__all__ = ['resource_filename', 'resource_string', 'get_distribution'] \ No newline at end of file From 2cd4cb337c803a9b6aae378ea0729cb5dca4d8a2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 13 Jul 2022 20:37:24 +0200 Subject: [PATCH 27/36] workspace.merge: if not copying files, at least rebase the relative paths --- ocrd/ocrd/workspace.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index 4d4a957c75..b276ddf0ce 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -103,7 +103,12 @@ def merge(self, other_workspace, copy_files=True, **kwargs): copy_files (boolean): Whether to copy files from `other_workspace` to this one """ def after_add_cb(f): + """callback to run on merged OcrdFile instances in the destination""" if not copy_files: + fpath_src = Path(other_workspace.directory).resolve() + fpath_dst = Path(self.directory).resolve() + dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath + f.url = str(Path(dstprefix, f.url)) return fpath_src = Path(other_workspace.directory, f.url) fpath_dest = Path(self.directory, f.url) From 6a0da65c9042cd8e686e3516733658b9d7325306 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 13 Jul 2022 20:38:29 +0200 Subject: [PATCH 28/36] workspace.merge: add fileId-mapping and pageId-mapping, allow passing JSON strings or files for all 3 mappings --- ocrd/ocrd/cli/workspace.py | 12 +++++++++++- ocrd_models/ocrd_models/ocrd_mets.py | 14 +++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 84e2b5f733..830152f856 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -573,16 +573,24 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin # ocrd workspace merge # ---------------------------------------------------------------------- +def 
_handle_json_option(ctx, param, value): + return parse_json_string_or_file(value) if value else None + @workspace_cli.command('merge') @click.argument('METS_PATH') @click.option('--copy-files/--no-copy-files', is_flag=True, help="Copy files as well", default=True, show_default=True) -@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp") +@click.option('--fileGrp-mapping', help="JSON object mapping src to dest fileGrp", callback=_handle_json_option) +@click.option('--fileId-mapping', help="JSON object mapping src to dest file ID", callback=_handle_json_option) +@click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option) @mets_find_options @pass_workspace def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype, mets_path): # pylint: disable=redefined-builtin """ Merges this workspace with the workspace that contains ``METS_PATH`` + Pass a JSON string or file to ``--fileGrp-mapping``, ``--fileId-mapping`` or ``--pageId-mapping`` + in order to rename all fileGrp, file ID or page ID values, respectively. + The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help`` for an explanation. 
@@ -596,6 +604,8 @@ def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype other_workspace, copy_files=copy_files, fileGrp_mapping=filegrp_mapping, + fileId_mapping=fileid_mapping, + pageId_mapping=pageid_mapping, fileGrp=file_grp, ID=file_id, pageId=page_id, diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 8161684c58..07c4b73519 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -485,7 +485,7 @@ def remove_physical_page_fptr(self, fileId): mets_div.remove(mets_fptr) return ret - def merge(self, other_mets, fileGrp_mapping=None, after_add_cb=None, **kwargs): + def merge(self, other_mets, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs): """ Add all files from other_mets. @@ -493,16 +493,24 @@ def merge(self, other_mets, fileGrp_mapping=None, after_add_cb=None, **kwargs): Keyword Args: fileGrp_mapping (dict): Map :py:attr:`other_mets` fileGrp to fileGrp in this METS + fileId_mapping (dict): Map :py:attr:`other_mets` file ID to file ID in this METS + pageId_mapping (dict): Map :py:attr:`other_mets` page ID to page ID in this METS after_add_cb (function): Callback received after file is added to the METS """ if not fileGrp_mapping: fileGrp_mapping = {} + if not fileId_mapping: + fileId_mapping = {} + if not pageId_mapping: + pageId_mapping = {} for f_src in other_mets.find_files(**kwargs): f_dest = self.add_file( fileGrp_mapping.get(f_src.fileGrp, f_src.fileGrp), mimetype=f_src.mimetype, url=f_src.url, - ID=f_src.ID, - pageId=f_src.pageId) + ID=fileId_mapping.get(f_src.ID, f_src.ID), + pageId=pageId_mapping.get(f_src.pageId, f_src.pageId)) + # FIXME: merge metsHdr, amdSec, dmdSec as well + # FIXME: merge structMap logical and structLink as well if after_add_cb: after_add_cb(f_dest) From cb8ea41caa2c53bf29216e2127037157b3070a80 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 13 Jul 2022 20:44:00 
+0200 Subject: [PATCH 29/36] workspace.merge: only rebase file URLs if local paths --- ocrd/ocrd/workspace.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/workspace.py b/ocrd/ocrd/workspace.py index b276ddf0ce..781d7681a5 100644 --- a/ocrd/ocrd/workspace.py +++ b/ocrd/ocrd/workspace.py @@ -33,6 +33,7 @@ polygon_from_points, xywh_from_bbox, pushd_popd, + is_local_filename, MIME_TO_EXT, MIME_TO_PIL, MIMETYPE_PAGE, @@ -108,7 +109,8 @@ def after_add_cb(f): fpath_src = Path(other_workspace.directory).resolve() fpath_dst = Path(self.directory).resolve() dstprefix = fpath_src.relative_to(fpath_dst) # raises ValueError if not a subpath - f.url = str(Path(dstprefix, f.url)) + if is_local_filename(f.url): + f.url = str(Path(dstprefix, f.url)) return fpath_src = Path(other_workspace.directory, f.url) fpath_dest = Path(self.directory, f.url) From b7e575499d9b30f3c39ca02a03dfee0415c66b91 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 Jul 2022 15:50:42 +0200 Subject: [PATCH 30/36] utils.package_resources: reference migration guide --- ocrd_utils/ocrd_utils/package_resources.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_utils/ocrd_utils/package_resources.py b/ocrd_utils/ocrd_utils/package_resources.py index 57644ba379..ee01d046f4 100644 --- a/ocrd_utils/ocrd_utils/package_resources.py +++ b/ocrd_utils/ocrd_utils/package_resources.py @@ -12,6 +12,7 @@ except ImportError: from importlib_metadata import distribution as get_distribution +# See https://importlib-resources.readthedocs.io/en/latest/migration.html#pkg-resources-resource-filename _file_manager = ExitStack() atexit.register(_file_manager.close) @@ -46,4 +47,4 @@ def resource_string(package: str, resource: str) -> bytes: return read_binary(parent_package, resource) -__all__ = ['resource_filename', 'resource_string', 'get_distribution'] \ No newline at end of file +__all__ = ['resource_filename', 'resource_string', 'get_distribution'] From 
eff0abd910b51798653dec588cb96bd9d887dc92 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 Jul 2022 15:52:19 +0200 Subject: [PATCH 31/36] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b33cce54a..48fdeb44bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + + * `ocrd_utils.generate_range`: `maxsplits` should be 1, not 2, #880 + ## [2.35.0] - 2022-06-02 Changed: From c6acd337b7f20b9177fcee2ffbd2b46dec4fc64a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 Jul 2022 17:16:14 +0200 Subject: [PATCH 32/36] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48fdeb44bb..7437d039ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ Fixed: * `ocrd_utils.generate_range`: `maxsplits` should be 1, not 2, #880 +Changed: + + * Consistenly use snake_case but continue to support CamelCase for kwargs and CLI options, #874, #862 + ## [2.35.0] - 2022-06-02 Changed: From 60c6c11db6e4a4c564db5b541f714013e440b64a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 Jul 2022 17:18:44 +0200 Subject: [PATCH 33/36] :memo: changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7437d039ac..0e5896870f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Fixed: Changed: * Consistenly use snake_case but continue to support CamelCase for kwargs and CLI options, #874, #862 + * Update to spec to 3.19.0, introducing greater flexibility in describing parameters, #872, #848, OCR-D/spec#206 ## [2.35.0] - 2022-06-02 From a674ea5799e8adf62fe5dd1a96e58a94d15007d9 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 Jul 2022 17:59:43 +0200 Subject: [PATCH 34/36] test no_copy_files and file overwrite of OcrdWorkspace.merge --- 
tests/test_workspace.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/test_workspace.py b/tests/test_workspace.py index de2c0ef833..c6a8e3d4c3 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -620,6 +620,39 @@ def test_merge(tmp_path): assert len(ws1.mets.find_all_files()) == 41 assert exists(join(dst_path1, 'OCR-D-IMG/FILE_0001_IMAGE.tif')) +def test_merge_no_copy_files(tmp_path): + + # arrange + dst_path1 = tmp_path / 'ws1' + dst_path1.mkdir() + dst_path2 = dst_path1 / 'ws2' + dst_path2.mkdir() + + ws1 = Resolver().workspace_from_nothing(directory=dst_path1) + ws2 = Resolver().workspace_from_nothing(directory=dst_path2) + + ws2.add_file('GRP2', pageId='p01', mimetype='text/plain', ID='f1', local_filename='GRP2/f1', content='ws2') + + ws1.merge(ws2, copy_files=False, fileId_mapping={'f1': 'f1_copy_files'}) + assert next(ws1.mets.find_files(ID='f1_copy_files')).url == 'ws2/GRP2/f1' + ws1.merge(ws2, copy_files=True, fileId_mapping={'f1': 'f1_no_copy_files'}) + assert next(ws1.mets.find_files(ID='f1_no_copy_files')).url == 'GRP2/f1' + +def test_merge_overwrite(tmp_path): + # arrange + dst_path1 = tmp_path / 'ws1' + dst_path1.mkdir() + dst_path2 = dst_path1 / 'ws2' + dst_path2.mkdir() + + ws1 = Resolver().workspace_from_nothing(directory=dst_path1) + ws2 = Resolver().workspace_from_nothing(directory=dst_path2) + + with pytest.raises(Exception) as exc: + ws1.add_file('X', pageId='X', mimetype='X', ID='id123', local_filename='X/X', content='ws1') + ws2.add_file('X', pageId='X', mimetype='X', ID='id456', local_filename='X/X', content='ws2') + ws1.merge(ws2) + assert "would overwrite" == str(exc.value) if __name__ == '__main__': main(__file__) From a78d4c50e92b2acc8fcb0f18fdb23da366631b41 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 15 Jul 2022 18:20:29 +0200 Subject: [PATCH 35/36] :memo: changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 0e5896870f..af98c7694c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ Changed: * Consistenly use snake_case but continue to support CamelCase for kwargs and CLI options, #874, #862 * Update to spec to 3.19.0, introducing greater flexibility in describing parameters, #872, #848, OCR-D/spec#206 + * `ocrd workspace merge`: support mapping `file_id` and `page_id` in addition to `file_grp`, #886, #888 + * `ocrd workspace merge`: rebase `OcrdFile.url` to target workspace, #887, #888 ## [2.35.0] - 2022-06-02 From 6fa65d0fdf2ed9731d95000c026d58981c9108d6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 18 Jul 2022 18:11:59 +0200 Subject: [PATCH 36/36] :memo: changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index af98c7694c..ca519b8070 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ Versioned according to [Semantic Versioning](http://semver.org/). Fixed: * `ocrd_utils.generate_range`: `maxsplits` should be 1, not 2, #880 + * Typos in CHANGELOG, README and code comments, #890 Changed: @@ -15,6 +16,7 @@ Changed: * Update to spec to 3.19.0, introducing greater flexibility in describing parameters, #872, #848, OCR-D/spec#206 * `ocrd workspace merge`: support mapping `file_id` and `page_id` in addition to `file_grp`, #886, #888 * `ocrd workspace merge`: rebase `OcrdFile.url` to target workspace, #887, #888 + * Replace `resource_filename` et al from pkg_resources with faster alternatives, #881, #882 ## [2.35.0] - 2022-06-02