From 48cffa8f6e034b13e345e70a5be2c3689326f96c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 17 Nov 2023 15:21:11 +0100 Subject: [PATCH 1/5] remove accidentally added test data --- .../bag-info.txt | 5 --- .../bagger-conflict-workspace.ocrd/bagit.txt | 2 -- .../data/A/name.ext | 0 .../data/mets.xml | 34 ------------------- .../manifest-sha512.txt | 2 -- .../tagmanifest-sha512.txt | 3 -- 6 files changed, 46 deletions(-) delete mode 100644 tests/data/bagger-conflict-workspace.ocrd/bag-info.txt delete mode 100644 tests/data/bagger-conflict-workspace.ocrd/bagit.txt delete mode 100644 tests/data/bagger-conflict-workspace.ocrd/data/A/name.ext delete mode 100644 tests/data/bagger-conflict-workspace.ocrd/data/mets.xml delete mode 100644 tests/data/bagger-conflict-workspace.ocrd/manifest-sha512.txt delete mode 100644 tests/data/bagger-conflict-workspace.ocrd/tagmanifest-sha512.txt diff --git a/tests/data/bagger-conflict-workspace.ocrd/bag-info.txt b/tests/data/bagger-conflict-workspace.ocrd/bag-info.txt deleted file mode 100644 index eb24da509..000000000 --- a/tests/data/bagger-conflict-workspace.ocrd/bag-info.txt +++ /dev/null @@ -1,5 +0,0 @@ -Bag-Software-Agent: ocrd/core 2.58.1 (bagit.py 1.8.1, bagit_profile 1.3.1) [cmdline: "/home/kba/monorepo/ocrd_all/venv/bin/ocrd zip bag -Z -i bar"] -BagIt-Profile-Identifier: https://ocr-d.github.io/bagit-profile.json -Bagging-Date: 2023-11-15 18:33:55.670559 -Ocrd-Identifier: bar -Payload-Oxum: 1842.2 diff --git a/tests/data/bagger-conflict-workspace.ocrd/bagit.txt b/tests/data/bagger-conflict-workspace.ocrd/bagit.txt deleted file mode 100644 index 33835cda7..000000000 --- a/tests/data/bagger-conflict-workspace.ocrd/bagit.txt +++ /dev/null @@ -1,2 +0,0 @@ -BagIt-Version: 1.0 -Tag-File-Character-Encoding: UTF-8 \ No newline at end of file diff --git a/tests/data/bagger-conflict-workspace.ocrd/data/A/name.ext b/tests/data/bagger-conflict-workspace.ocrd/data/A/name.ext deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/data/bagger-conflict-workspace.ocrd/data/mets.xml b/tests/data/bagger-conflict-workspace.ocrd/data/mets.xml deleted file mode 100644 index 7b0131b43..000000000 --- a/tests/data/bagger-conflict-workspace.ocrd/data/mets.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - - ocrd/core v2.58.1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tests/data/bagger-conflict-workspace.ocrd/manifest-sha512.txt b/tests/data/bagger-conflict-workspace.ocrd/manifest-sha512.txt deleted file mode 100644 index 751cb26c0..000000000 --- a/tests/data/bagger-conflict-workspace.ocrd/manifest-sha512.txt +++ /dev/null @@ -1,2 +0,0 @@ -48e017c062dd5d8a47d11378e72ec6b95837c2e579090949593451d118b58a10ca34586469011191c63764dab10b3032d69f2951cca05239bd8adb2f17db5623 data/mets.xml -cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e data/A/name.ext diff --git a/tests/data/bagger-conflict-workspace.ocrd/tagmanifest-sha512.txt b/tests/data/bagger-conflict-workspace.ocrd/tagmanifest-sha512.txt deleted file mode 100644 index 36ef96773..000000000 --- a/tests/data/bagger-conflict-workspace.ocrd/tagmanifest-sha512.txt +++ /dev/null @@ -1,3 +0,0 @@ -da429ec4f5ca60aebd25aa4c26ccf51ae87609129ae76f09683c810da4af4414277250ecc0253bf7871c7a3f9c5b3ed6918a5d5c092a7dc394051601eb8bb3ab bagit.txt -679cb2bdba9444dd8a9ef4904d0f08127bc442f0e5977aff5444cc662d806cd6aa6b044cf0a9dcff21cd69879f6c8bf7f4e50ff0a799a9429528e58413489305 bag-info.txt -2db518ce1da5b54203965d43cfc5cca657bfae99c405a1c858854e7ad53f58d10d70bd0f200e66627395bb02cfcf01a1786924a4b571f01269b130de2a2e66ac manifest-sha512.txt From 15bd3f571a13d9e6562535c73c651dfb49676be6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 17 Nov 2023 16:06:18 +0100 Subject: [PATCH 2/5] WorkspaceBagger: Allow filtering fileGrps with -I/-X --- ocrd/ocrd/cli/zip.py | 8 +++- ocrd/ocrd/workspace_bagger.py | 25 +++++++++--- .../A/{name.ext => name.xml} | 0 .../B/{name.ext => name.xml} | 0 .../C/{name.ext => name.xml} | 0 tests/data/bagger-conflict-workspace/mets.xml | 6 +-- tests/validator/test_workspace_bagger.py | 40 +++++++++++++++++-- 7 files changed, 65 insertions(+), 14 deletions(-) rename tests/data/bagger-conflict-source/A/{name.ext => name.xml} (100%) rename tests/data/bagger-conflict-source/B/{name.ext => name.xml} (100%) rename tests/data/bagger-conflict-source/C/{name.ext => name.xml} (100%) diff --git a/ocrd/ocrd/cli/zip.py b/ocrd/ocrd/cli/zip.py index 4ab53bfd4..67ffb2958 100644 --- a/ocrd/ocrd/cli/zip.py +++ b/ocrd/ocrd/cli/zip.py @@ -38,13 +38,15 @@ def zip_cli(): default="mets.xml", help='Basename of the METS file.', show_default=True) +@click.option('-I', '--include-file-grps', help="fileGrps to include", default=[], multiple=True) +@click.option('-X', '--exclude-file-grps', help="fileGrps to include", default=[], multiple=True) @click.option('-i', '--identifier', '--id', help="Ocrd-Identifier", required=True) @click.option('-m', '--mets', help="location of mets.xml in the bag's data dir", default="mets.xml") @click.option('-b', '--base-version-checksum', help="Ocrd-Base-Version-Checksum") @click.option('-t', '--tag-file', help="Add a non-payload file to bag", type=click.Path(file_okay=True, dir_okay=False, readable=True, resolve_path=True), multiple=True) @click.option('-Z', '--skip-zip', help="Create a directory but do not ZIP it", is_flag=True, default=False) @click.option('-j', '--processes', help="Number of parallel processes", type=int, default=1) -def bag(directory, mets_basename, dest, identifier, mets, base_version_checksum, tag_file, skip_zip, processes): +def bag(directory, mets_basename, dest, include_file_grps, exclude_file_grps, identifier, mets, base_version_checksum, tag_file, skip_zip, processes): """ Bag workspace as OCRD-ZIP at DEST """ @@ -59,7 +61,9 @@ def bag(directory, mets_basename, dest, identifier, mets, base_version_checksum, ocrd_base_version_checksum=base_version_checksum, processes=processes, tag_files=tag_file, - skip_zip=skip_zip + skip_zip=skip_zip, + include_file_grps=include_file_grps, + exclude_file_grps=exclude_file_grps, ) # ---------------------------------------------------------------------- diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py index 8602311cc..d7363ec74 100644 --- a/ocrd/ocrd/workspace_bagger.py +++ b/ocrd/ocrd/workspace_bagger.py @@ -15,7 +15,6 @@ pushd_popd, getLogger, MIME_TO_EXT, - is_local_filename, unzip_file_to_dir, MIMETYPE_PAGE, @@ -56,7 +55,15 @@ def _log_or_raise(self, msg): else: log.info(msg) - def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes): + def _bag_mets_files( + self, + workspace, + bagdir, + ocrd_mets, + processes, + include_file_grps=[], + exclude_file_grps=[], + ): mets = workspace.mets changed_local_filenames = {} @@ -66,7 +73,13 @@ def _bag_mets_files(self, workspace, bagdir, ocrd_mets, processes): with pushd_popd(workspace.directory): # local_filenames of the files before changing for f in mets.find_files(): - log.info("Handling OcrdFile %s", f) + if include_file_grps and f.fileGrp not in include_file_grps: + log.info(f"Skipping {f} because it is not in include_file_grps {include_file_grps}") + continue + if exclude_file_grps and f.fileGrp in exclude_file_grps: + log.info(f"Skipping {f} because it is in exclude_file_grps {exclude_file_grps}") + continue + log.info("Bagging OcrdFile %s", f) file_grp_dir = Path(bagdir, 'data', f.fileGrp) if not file_grp_dir.is_dir(): @@ -130,7 +143,9 @@ def bag(self, ocrd_base_version_checksum=None, processes=1, skip_zip=False, - tag_files=None + tag_files=None, + include_file_grps=[], + exclude_file_grps=[], ): """ Bag a workspace @@ -170,7 +185,7 @@ def bag(self, f.write(BAGIT_TXT.encode('utf-8')) # create manifests - total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes) + total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes, include_file_grps, exclude_file_grps) # create bag-info.txt bag = Bag(bagdir) diff --git a/tests/data/bagger-conflict-source/A/name.ext b/tests/data/bagger-conflict-source/A/name.xml similarity index 100% rename from tests/data/bagger-conflict-source/A/name.ext rename to tests/data/bagger-conflict-source/A/name.xml diff --git a/tests/data/bagger-conflict-source/B/name.ext b/tests/data/bagger-conflict-source/B/name.xml similarity index 100% rename from tests/data/bagger-conflict-source/B/name.ext rename to tests/data/bagger-conflict-source/B/name.xml diff --git a/tests/data/bagger-conflict-source/C/name.ext b/tests/data/bagger-conflict-source/C/name.xml similarity index 100% rename from tests/data/bagger-conflict-source/C/name.ext rename to tests/data/bagger-conflict-source/C/name.xml diff --git a/tests/data/bagger-conflict-workspace/mets.xml b/tests/data/bagger-conflict-workspace/mets.xml index ab23ccce1..ba0b85f83 100644 --- a/tests/data/bagger-conflict-workspace/mets.xml +++ b/tests/data/bagger-conflict-workspace/mets.xml @@ -18,13 +18,13 @@ - + - + - + diff --git a/tests/validator/test_workspace_bagger.py b/tests/validator/test_workspace_bagger.py index 2bf3c83c8..e4f821e0e 100644 --- a/tests/validator/test_workspace_bagger.py +++ b/tests/validator/test_workspace_bagger.py @@ -10,7 +10,7 @@ from ocrd.workspace import Workspace from ocrd.workspace_bagger import WorkspaceBagger, BACKUPDIR from ocrd.resolver import Resolver -from ocrd_utils import unzip_file_to_dir +from ocrd_utils import unzip_file_to_dir, pushd_popd, initLogging from pytest import fixture, raises @@ -208,9 +208,41 @@ def test_basename_conflict(tmpdir): bagger.bag(workspace, 'bagger-conflict-test', skip_zip=True, dest=str(tmpdir / 'bag')) assert len(workspace.mets.find_all_files()) == 3 assert len(list(Path(tmpdir, 'bag/data/A').iterdir())) == 3 - # import os - # os.system(f'find {tmpdir}') - # assert 0 + +def test_filter(tmpdir): + # initLogging() + bagger = WorkspaceBagger(Resolver()) + fgs = ['A', 'B', 'C', 'D', 'E'] + with pushd_popd(tmpdir): + Path('workspace').mkdir() + workspace = Resolver().workspace_from_nothing(tmpdir / 'workspace') + for fg in fgs: + Path(fg).mkdir() + for i in range(3): + workspace.add_file(fg, mimetype='foo/bar', file_id=f'{fg}_{i}', page_id=f'page{i}', local_filename=f'{fg}/file{i}.ext', content='') + # print([str(x) for x in workspace.mets.find_all_files()]) + + # test w/o filter + bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag1')) + for fg in fgs: + assert len(list(Path(f'bag1/data/{fg}').iterdir())) == 3 + + # test include + bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag2'), include_file_grps=['A']) + assert len(list(Path(f'bag2/data/A').iterdir())) == 3 + for fg in ['B', 'C', 'D', 'E']: + assert not Path(f'bag2/data/{fg}').exists() + + # test exclude + bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag3'), exclude_file_grps=['A']) + assert not Path(f'bag3/data/A').exists() + for fg in ['B', 'C', 'D', 'E']: + assert len(list(Path(f'bag3/data/{fg}').iterdir())) == 3 + + # test include + exclude + bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag4'), exclude_file_grps=['A'], include_file_grps=['A']) + for fg in fgs: + assert not Path(f'bag4/data/{fg}').exists() if __name__ == '__main__': main(__name__) From 1f6b9a37437d80443f06a538197e0a3a55e714ac Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 20 Nov 2023 13:52:12 +0100 Subject: [PATCH 3/5] Generalize {in,ex}clude_fileGrp to all file-searching functionality --- ocrd/ocrd/cli/workspace.py | 18 ++++++++++++---- ocrd/ocrd/cli/zip.py | 10 ++++----- ocrd/ocrd/decorators/mets_find_options.py | 2 ++ ocrd/ocrd/resolver.py | 4 +++- ocrd/ocrd/workspace_bagger.py | 18 ++++++---------- ocrd_models/ocrd_models/ocrd_mets.py | 25 +++++++++++++++++++++-- tests/model/test_ocrd_mets.py | 2 ++ tests/validator/test_workspace_bagger.py | 6 +++--- 8 files changed, 58 insertions(+), 27 deletions(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index a405f2318..beabe2b97 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -118,10 +118,11 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency @click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True) @click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning") @click.argument('mets_url') +@mets_find_options # XXX deprecated @click.argument('workspace_dir', default=None, required=False) @pass_workspace -def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir): +def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_url, workspace_dir): """ Create a workspace from METS_URL and return the directory @@ -140,6 +141,11 @@ def workspace_clone(ctx, clobber_mets, download, mets_url, workspace_dir): mets_basename=ctx.mets_basename, clobber_mets=clobber_mets, download=download, + ID=file_id, + pageId=page_id, + mimetype=mimetype, + include_fileGrp=include_fileGrp, + exclude_fileGrp=exclude_fileGrp, ) workspace.save_mets() print(workspace.directory) @@ -431,7 +437,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi @click.option('--undo-download', is_flag=True, help="Remove all downloaded files from the METS") @click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests") @pass_workspace -def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download, undo_download, wait): +def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, include_fileGrp, exclude_fileGrp, download, undo_download, wait): """ Find files. @@ -453,6 +459,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, down file_grp=file_grp, mimetype=mimetype, page_id=page_id, + include_fileGrp=include_fileGrp, + exclude_fileGrp=exclude_fileGrp, ): ret_entry = [f.ID if field == 'pageId' else str(getattr(f, field)) or '' for field in output_field] if download and not f.local_filename: @@ -660,7 +668,7 @@ def _handle_json_option(ctx, param, value): @click.option('--pageId-mapping', help="JSON object mapping src to dest page ID", callback=_handle_json_option) @mets_find_options @pass_workspace -def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, mets_path): # pylint: disable=redefined-builtin +def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pageid_mapping, file_grp, file_id, page_id, mimetype, include_fileGrp, exclude_fileGrp, mets_path): # pylint: disable=redefined-builtin """ Merges this workspace with the workspace that contains ``METS_PATH`` @@ -687,7 +695,9 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa file_grp=file_grp, file_id=file_id, page_id=page_id, - mimetype=mimetype + mimetype=mimetype, + include_fileGrp=include_fileGrp, + exclude_fileGrp=exclude_fileGrp, ) workspace.save_mets() diff --git a/ocrd/ocrd/cli/zip.py b/ocrd/ocrd/cli/zip.py index 67ffb2958..022fab39d 100644 --- a/ocrd/ocrd/cli/zip.py +++ b/ocrd/ocrd/cli/zip.py @@ -38,15 +38,15 @@ def zip_cli(): default="mets.xml", help='Basename of the METS file.', show_default=True) -@click.option('-I', '--include-file-grps', help="fileGrps to include", default=[], multiple=True) -@click.option('-X', '--exclude-file-grps', help="fileGrps to include", default=[], multiple=True) +@click.option('-q', '--include-file-grps', 'include_fileGrp', help="fileGrps to include", default=[], multiple=True) +@click.option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to include", default=[], multiple=True) @click.option('-i', '--identifier', '--id', help="Ocrd-Identifier", required=True) @click.option('-m', '--mets', help="location of mets.xml in the bag's data dir", default="mets.xml") @click.option('-b', '--base-version-checksum', help="Ocrd-Base-Version-Checksum") @click.option('-t', '--tag-file', help="Add a non-payload file to bag", type=click.Path(file_okay=True, dir_okay=False, readable=True, resolve_path=True), multiple=True) @click.option('-Z', '--skip-zip', help="Create a directory but do not ZIP it", is_flag=True, default=False) @click.option('-j', '--processes', help="Number of parallel processes", type=int, default=1) -def bag(directory, mets_basename, dest, include_file_grps, exclude_file_grps, identifier, mets, base_version_checksum, tag_file, skip_zip, processes): +def bag(directory, mets_basename, dest, include_fileGrp, exclude_fileGrp, identifier, mets, base_version_checksum, tag_file, skip_zip, processes): """ Bag workspace as OCRD-ZIP at DEST """ @@ -62,8 +62,8 @@ def bag(directory, mets_basename, dest, include_file_grps, exclude_file_grps, id processes=processes, tag_files=tag_file, skip_zip=skip_zip, - include_file_grps=include_file_grps, - exclude_file_grps=exclude_file_grps, + include_fileGrp=include_fileGrp, + exclude_fileGrp=exclude_fileGrp, ) # ---------------------------------------------------------------------- diff --git a/ocrd/ocrd/decorators/mets_find_options.py b/ocrd/ocrd/decorators/mets_find_options.py index c661af5d4..f1200102a 100644 --- a/ocrd/ocrd/decorators/mets_find_options.py +++ b/ocrd/ocrd/decorators/mets_find_options.py @@ -6,6 +6,8 @@ def mets_find_options(f): option('-m', '--mimetype', help="Media type to look for", metavar='FILTER'), option('-g', '--page-id', help="Page ID", metavar='FILTER'), option('-i', '--file-id', help="ID", metavar='FILTER'), + option('-q', '--include-file-grps', 'include_fileGrp', help="fileGrps to include", default=[], multiple=True), + option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to include", default=[], multiple=True), ]: opt(f) return f diff --git a/ocrd/ocrd/resolver.py b/ocrd/ocrd/resolver.py index 649a934e5..25f7507f1 100644 --- a/ocrd/ocrd/resolver.py +++ b/ocrd/ocrd/resolver.py @@ -158,6 +158,7 @@ def workspace_from_url( download=False, src_baseurl=None, mets_server_url=None, + **kwargs ): """ Create a workspace from a METS by URL (i.e. clone if :py:attr:`mets_url` is remote or :py:attr:`dst_dir` is given). @@ -172,6 +173,7 @@ def workspace_from_url( By default existing ``mets.xml`` will raise an exception. download (boolean, False): Whether to also download all the files referenced by the METS src_baseurl (string, None): Base URL for resolving relative file locations + **kwargs (): Passed on to ``OcrdMets.find_files`` if download == True Download (clone) :py:attr:`mets_url` to ``mets.xml`` in :py:attr:`dst_dir`, unless the former is already local and the latter is ``none`` or already identical to its directory name. @@ -217,7 +219,7 @@ def workspace_from_url( workspace = Workspace(self, dst_dir, mets_basename=mets_basename, baseurl=src_baseurl, mets_server_url=mets_server_url) if download: - for f in workspace.mets.find_files(): + for f in workspace.mets.find_files(**kwargs): workspace.download_file(f) return workspace diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py index d7363ec74..af6519f01 100644 --- a/ocrd/ocrd/workspace_bagger.py +++ b/ocrd/ocrd/workspace_bagger.py @@ -61,8 +61,8 @@ def _bag_mets_files( bagdir, ocrd_mets, processes, - include_file_grps=[], - exclude_file_grps=[], + include_fileGrp=None, + exclude_fileGrp=None, ): mets = workspace.mets changed_local_filenames = {} @@ -72,13 +72,7 @@ def _bag_mets_files( with pushd_popd(workspace.directory): # local_filenames of the files before changing - for f in mets.find_files(): - if include_file_grps and f.fileGrp not in include_file_grps: - log.info(f"Skipping {f} because it is not in include_file_grps {include_file_grps}") - continue - if exclude_file_grps and f.fileGrp in exclude_file_grps: - log.info(f"Skipping {f} because it is in exclude_file_grps {exclude_file_grps}") - continue + for f in mets.find_files(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp): log.info("Bagging OcrdFile %s", f) file_grp_dir = Path(bagdir, 'data', f.fileGrp) @@ -144,8 +138,8 @@ def bag(self, processes=1, skip_zip=False, tag_files=None, - include_file_grps=[], - exclude_file_grps=[], + include_fileGrp=None, + exclude_fileGrp=None, ): """ Bag a workspace @@ -185,7 +179,7 @@ def bag(self, f.write(BAGIT_TXT.encode('utf-8')) # create manifests - total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes, include_file_grps, exclude_file_grps) + total_bytes, total_files = self._bag_mets_files(workspace, bagdir, ocrd_mets, processes, include_fileGrp, exclude_fileGrp) # create bag-info.txt bag = Bag(bagdir) diff --git a/ocrd_models/ocrd_models/ocrd_mets.py b/ocrd_models/ocrd_models/ocrd_mets.py index 457e19324..3319f8f6f 100644 --- a/ocrd_models/ocrd_models/ocrd_mets.py +++ b/ocrd_models/ocrd_models/ocrd_mets.py @@ -236,7 +236,18 @@ def find_all_files(self, *args, **kwargs): return list(self.find_files(*args, **kwargs)) # pylint: disable=multiple-statements - def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None, local_filename=None, local_only=False): + def find_files( + self, + ID=None, + fileGrp=None, + pageId=None, + mimetype=None, + url=None, + local_filename=None, + local_only=False, + include_fileGrp=None, + exclude_fileGrp=None, + ): """ Search ``mets:file`` entries in this METS document and yield results. The :py:attr:`ID`, :py:attr:`pageId`, :py:attr:`fileGrp`, @@ -257,6 +268,8 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None local_filename (string) : ``@xlink:href`` local/cached filename of ``mets:Flocat`` of ``mets:file`` mimetype (string) : ``@MIMETYPE`` of ``mets:file`` local (boolean) : Whether to restrict results to local files in the filesystem + include_fileGrp (list[str]) : Whitelist of allowd file groups + exclude_fileGrp (list[str]) : Blacklist of disallowd file groups Yields: :py:class:`ocrd_models:ocrd_file:OcrdFile` instantiations """ @@ -351,7 +364,15 @@ def find_files(self, ID=None, fileGrp=None, pageId=None, mimetype=None, url=None if is_local is None: continue - yield OcrdFile(cand, mets=self) + ret = OcrdFile(cand, mets=self) + + # XXX include_fileGrp is redundant to fileGrp but for completeness + if exclude_fileGrp and ret.fileGrp in exclude_fileGrp: + continue + if include_fileGrp and ret.fileGrp not in include_fileGrp: + continue + + yield ret def add_file_group(self, fileGrp): """ diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 1465098af..64ea1eccf 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -78,7 +78,9 @@ def test_file_groups(sbb_sample_01): def test_find_all_files(sbb_sample_01): assert len(sbb_sample_01.find_all_files()) == 35, '35 files total' assert len(sbb_sample_01.find_all_files(fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' + assert len(sbb_sample_01.find_all_files(include_fileGrp='OCR-D-IMG')) == 3, '3 files in "OCR-D-IMG"' assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*')) == 13, '13 files in "//OCR-D-I.*"' + assert len(sbb_sample_01.find_all_files(fileGrp='//OCR-D-I.*', exclude_fileGrp=['OCR-D-IMG'])) == 10, '10 files in "//OCR-D-I.*" sans OCR-D-IMG' assert len(sbb_sample_01.find_all_files(ID="FILE_0001_IMAGE")) == 1, '1 files with ID "FILE_0001_IMAGE"' assert len(sbb_sample_01.find_all_files(ID="//FILE_0005_.*")) == 1, '1 files with ID "//FILE_0005_.*"' assert len(sbb_sample_01.find_all_files(pageId='PHYS_0001')) == 17, '17 files for page "PHYS_0001"' diff --git a/tests/validator/test_workspace_bagger.py b/tests/validator/test_workspace_bagger.py index e4f821e0e..3cd0fb621 100644 --- a/tests/validator/test_workspace_bagger.py +++ b/tests/validator/test_workspace_bagger.py @@ -228,19 +228,19 @@ def test_filter(tmpdir): assert len(list(Path(f'bag1/data/{fg}').iterdir())) == 3 # test include - bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag2'), include_file_grps=['A']) + bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag2'), include_fileGrp=['A']) assert len(list(Path(f'bag2/data/A').iterdir())) == 3 for fg in ['B', 'C', 'D', 'E']: assert not Path(f'bag2/data/{fg}').exists() # test exclude - bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag3'), exclude_file_grps=['A']) + bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag3'), exclude_fileGrp=['A']) assert not Path(f'bag3/data/A').exists() for fg in ['B', 'C', 'D', 'E']: assert len(list(Path(f'bag3/data/{fg}').iterdir())) == 3 # test include + exclude - bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag4'), exclude_file_grps=['A'], include_file_grps=['A']) + bagger.bag(workspace, 'foo', skip_zip=True, dest=str(tmpdir / 'bag4'), exclude_fileGrp=['A'], include_fileGrp=['A']) for fg in fgs: assert not Path(f'bag4/data/{fg}').exists() From 2c1faec92787e16b504aa011d873f6cd8e7a059d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 20 Nov 2023 14:20:51 +0100 Subject: [PATCH 4/5] WorkspaceValidator: Support {in,ex}clude_fileGrp --- .../ocrd_validators/workspace_validator.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/ocrd_validators/ocrd_validators/workspace_validator.py b/ocrd_validators/ocrd_validators/workspace_validator.py index 22a34c723..4061cd887 100644 --- a/ocrd_validators/ocrd_validators/workspace_validator.py +++ b/ocrd_validators/ocrd_validators/workspace_validator.py @@ -62,7 +62,9 @@ def check_file_grp(workspace, input_file_grp=None, output_file_grp=None, page_id return report def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False, - page_strictness='strict', page_coordinate_consistency='poly'): + page_strictness='strict', page_coordinate_consistency='poly', + include_fileGrp=None, exclude_fileGrp=None + ): """ Construct a new WorkspaceValidator. @@ -80,6 +82,8 @@ def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False, * `"baseline"`: Baseline in TextLine * `"both"`: both `poly` and `baseline` checks * `"off"`: no coordinate checks + include_fileGrp (list[str]): filegrp whitelist + exclude_fileGrp (list[str]): filegrp blacklist """ self.report = ValidationReport() self.skip = skip if skip else [] @@ -97,6 +101,7 @@ def __init__(self, resolver, mets_url, src_dir=None, skip=None, download=False, if 'mets_fileid_page_pcgtsid' not in self.skip: self.page_checks.append('pcgtsid') + self.find_kwargs = dict(include_fileGrp=include_fileGrp, exclude_fileGrp=exclude_fileGrp) self.src_dir = src_dir self.workspace = None self.mets = None @@ -184,14 +189,14 @@ def _validate_imagefilename(self): Validate that the imageFilename is correctly set to a filename relative to the workspace """ self.log.debug('_validate_imagefilename') - for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): + for f in self.mets.find_files(mimetype=MIMETYPE_PAGE, **self.find_kwargs): if not f.local_filename and not self.download: self.log.warning("Not available locally and 'download' is not set: %s", f) continue self.workspace.download_file(f) page = page_from_file(f).get_Page() imageFilename = page.imageFilename - if not self.mets.find_files(url=imageFilename): + if not self.mets.find_files(url=imageFilename, **self.find_kwargs): self.report.add_error("PAGE-XML %s : imageFilename '%s' not found in METS" % (f.local_filename, imageFilename)) if is_local_filename(imageFilename) and not Path(imageFilename).exists(): self.report.add_warning("PAGE-XML %s : imageFilename '%s' points to non-existent local file" % (f.local_filename, imageFilename)) @@ -201,7 +206,7 @@ def _validate_dimension(self): Validate image height and PAGE imageHeight match """ self.log.info('_validate_dimension') - for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): + for f in self.mets.find_files(mimetype=MIMETYPE_PAGE, **self.find_kwargs): if not f.local_filename and not self.download: self.log.warning("Not available locally and 'download' is not set: %s", f) continue @@ -220,7 +225,7 @@ def _validate_multipage(self): See `spec `_. """ self.log.debug('_validate_multipage') - for f in self.mets.find_files(mimetype='//image/.*'): + for f in self.mets.find_files(mimetype='//image/.*', **self.find_kwargs): if not f.local_filename and not self.download: self.log.warning("Not available locally and 'download' is not set: %s", f) continue @@ -240,7 +245,7 @@ def _validate_pixel_density(self): See `spec `_. """ self.log.debug('_validate_pixel_density') - for f in self.mets.find_files(mimetype='//image/.*'): + for f in self.mets.find_files(mimetype='//image/.*', **self.find_kwargs): if not f.local_filename and not self.download: self.log.warning("Not available locally and 'download' is not set: %s", f) continue @@ -282,10 +287,10 @@ def _validate_mets_files(self): """ self.log.debug('_validate_mets_files') try: - next(self.mets.find_files()) + next(self.mets.find_files(**self.find_kwargs)) except StopIteration: self.report.add_error("No files") - for f in self.mets.find_files(): + for f in self.mets.find_files(**self.find_kwargs): if f._el.get('GROUPID'): # pylint: disable=protected-access self.report.add_notice("File '%s' has GROUPID attribute - document might need an update" % f.ID) if not (f.url or f.local_filename): @@ -303,7 +308,7 @@ def _validate_page(self): Run PageValidator on the PAGE-XML documents referenced in the METS. """ self.log.debug('_validate_page') - for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): + for f in self.mets.find_files(mimetype=MIMETYPE_PAGE, **self.find_kwargs): if not f.local_filename and not self.download: self.log.warning("Not available locally and 'download' is not set: %s", f) continue @@ -322,7 +327,7 @@ def _validate_page_xsd(self): Validate all PAGE-XML files against PAGE XSD schema """ self.log.debug('_validate_page_xsd') - for f in self.mets.find_files(mimetype=MIMETYPE_PAGE): + for f in self.mets.find_files(mimetype=MIMETYPE_PAGE, **self.find_kwargs): if not f.local_filename and not self.download: self.log.warning("Not available locally and 'download' is not set: %s", f) continue From ab1ab4f4a2c9764f11b1cf06aec81801a41955a8 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 20 Nov 2023 18:01:50 +0100 Subject: [PATCH 5/5] typo {in,ex} Badly copy/pasted, hooray for code reviews Co-authored-by: Mehmed Mustafa --- ocrd/ocrd/cli/zip.py | 2 +- ocrd/ocrd/decorators/mets_find_options.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd/ocrd/cli/zip.py b/ocrd/ocrd/cli/zip.py index 022fab39d..d36bfc85b 100644 --- a/ocrd/ocrd/cli/zip.py +++ b/ocrd/ocrd/cli/zip.py @@ -39,7 +39,7 @@ def zip_cli(): help='Basename of the METS file.', show_default=True) @click.option('-q', '--include-file-grps', 'include_fileGrp', help="fileGrps to include", default=[], multiple=True) -@click.option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to include", default=[], multiple=True) +@click.option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to exclude", default=[], multiple=True) @click.option('-i', '--identifier', '--id', help="Ocrd-Identifier", required=True) @click.option('-m', '--mets', help="location of mets.xml in the bag's data dir", default="mets.xml") @click.option('-b', '--base-version-checksum', help="Ocrd-Base-Version-Checksum") diff --git a/ocrd/ocrd/decorators/mets_find_options.py b/ocrd/ocrd/decorators/mets_find_options.py index f1200102a..f604605d3 100644 --- a/ocrd/ocrd/decorators/mets_find_options.py +++ b/ocrd/ocrd/decorators/mets_find_options.py @@ -7,7 +7,7 @@ def mets_find_options(f): option('-g', '--page-id', help="Page ID", metavar='FILTER'), option('-i', '--file-id', help="ID", metavar='FILTER'), option('-q', '--include-file-grps', 'include_fileGrp', help="fileGrps to include", default=[], multiple=True), - option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to include", default=[], multiple=True), + option('-Q', '--exclude-file-grps', 'exclude_fileGrp', help="fileGrps to exclude", default=[], multiple=True), ]: opt(f) return f