Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ocrd_mets: add get_physical_pages(for_pageIds=...) #1063

Merged
merged 25 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1e3e702
ocrd_mets: add get_physical_pages(for_pageIds=...)
bertsky Jun 26, 2023
07a9fe0
ocrd workspace list-page: --page-id option
bertsky Jun 26, 2023
25854c5
ocrd_mets: expose property physical_pages_labels
bertsky Jun 28, 2023
ccb51ce
ocrd workspace list-page: add --output-field, delegating to page labels
bertsky Jun 28, 2023
e181758
get phys pages returns strs or divs
MehmedGIT Jul 4, 2023
26b64c9
merge master and adapt to page-range output changes
kba Jan 15, 2024
073d9b0
update list-page-workspace with @ORDER
kba Jan 15, 2024
e91cf50
add typing info for caches in OcrdMets
kba Jan 15, 2024
c642d04
more complete test workspace for page labelling/partitioning
kba Jan 15, 2024
9dea95f
replace update-page with a cleaner solution based on get_physical_pages
kba Jan 15, 2024
cfd1c91
OcrdMets: extend the _page_cache to include all METS_PAGE_DIV_ATTRIBUTEs
kba Jan 16, 2024
ee8fb69
implement generic page attribute ranges
kba Jan 16, 2024
1427c07
utils.generate_range: raise a ValueError if non-numeric parts differ
kba Jan 17, 2024
c36360d
fix tests
kba Jan 17, 2024
3a60c1f
revert accidental commit to ocrd_utils/pyproject.toml
kba Jan 17, 2024
643d1ef
Merge branch 'master' into ocrd-mets-get-pages-for-pageids
kba Jan 30, 2024
517814b
get_physical_pages: return early if no patterns
kba Jan 30, 2024
1225912
OcrdMets.find_all_files: fix page attr loop
kba Feb 6, 2024
4a25d1e
OcrdMets.get_physical_pages should return IDs if not return_divs
kba Feb 8, 2024
466c61d
OcrdMets.get_physical_pages: Cache the attribute in the non-cached re…
kba Feb 8, 2024
9f84067
OcrdMets.get_physical_pages: raise ValueError if a pattern matches no…
kba Feb 8, 2024
2647831
OcrdMets.get_physical_pages: iterate over pages, then patterns in non…
kba Feb 8, 2024
28a1f18
adapt tests to stricter page pattern matching
kba Feb 8, 2024
c6cfe03
OcrdMets.get_physical_pages: raise ValueError if range start not matched
kba Feb 9, 2024
8e06532
Merge branch 'master' into ocrd-mets-get-pages-for-pageids
kba Feb 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 46 additions & 17 deletions ocrd/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,21 +370,22 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
default=['url'],
multiple=True,
type=click.Choice([
'url',
'mimetype',
'page_id',
'pageId',
'file_id',
'ID',
'file_grp',
'fileGrp',
'basename',
'basename_without_extension',
'local_filename',
]))
default=['url'],
show_default=True,
multiple=True,
type=click.Choice([
'url',
'mimetype',
'page_id',
'pageId',
'file_id',
'ID',
'file_grp',
'fileGrp',
'basename',
'basename_without_extension',
'local_filename',
]))
@click.option('--download', is_flag=True, help="Download found files to workspace and change location in METS file ")
@click.option('--wait', type=int, default=0, help="Wait this many seconds between download requests")
@pass_workspace
Expand Down Expand Up @@ -535,13 +536,41 @@ def list_groups(ctx):
# ----------------------------------------------------------------------

@workspace_cli.command('list-page')
@click.option('-g', '--page-id', help="Page ID", metavar='FILTER')
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
              default=['ID'],
              show_default=True,
              multiple=True,
              type=click.Choice([
                  'ID',
                  'ORDER',
                  'ORDERLABEL',
                  'LABEL',
              ]))
@pass_workspace
def list_pages(ctx, page_id, output_field):
    """
    List physical page IDs

    (If any ``FILTER`` starts with ``//``, then its remainder
    will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    # Without a filter every physical page is listed; otherwise the selector
    # expression is handed to the METS model for resolution.
    if page_id is None:
        pages = workspace.mets.physical_pages
    else:
        pages = workspace.mets.get_physical_pages(for_pageIds=page_id)

    # Order of the values in each tuple of ``physical_pages_labels``.
    label_fields = ('ORDER', 'ORDERLABEL', 'LABEL')

    # click passes ``multiple=True`` options as a tuple, so a comparison with
    # the list ``['ID']`` can never succeed. Decide per field instead, and only
    # query the page labels when some non-ID field was actually requested.
    if all(field == 'ID' for field in output_field):
        labels = {}
    else:
        labels = workspace.mets.physical_pages_labels

    def field2label(page, field):
        # The page ID is the page itself; every other field is looked up
        # positionally in that page's label tuple.
        if field == 'ID':
            return page
        return labels[page][label_fields.index(field)]

    # One line per page, requested fields joined by tab; absent attributes
    # are printed as empty strings.
    for page in pages:
        print("\t".join(field2label(page, field) or ''
                        for field in output_field))

# ----------------------------------------------------------------------
# ocrd workspace get-id
Expand Down
45 changes: 41 additions & 4 deletions ocrd_models/ocrd_models/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,15 +571,40 @@ def physical_pages(self):
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]/@ID',
namespaces=NS)

def get_physical_pages(self, for_fileIds=None):
def get_physical_pages(self, for_fileIds=None, for_pageIds=None):
"""
List all page IDs (the ``@ID`` of each physical ``mets:structMap`` ``mets:div``),
optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`.
optionally for a subset of ``mets:file`` ``@ID`` :py:attr:`for_fileIds`,
or for a subset selector expression (comma-separated, range, and/or regex) :py:attr:`for_pageIds`.
"""
if for_fileIds is None:
if for_fileIds is None and for_pageIds is None:
return self.physical_pages
if for_pageIds is not None:
ret = []
pageId_patterns = []
for pageId_token in re.split(r',', for_pageIds):
if pageId_token.startswith(REGEX_PREFIX):
pageId_patterns.append(re.compile(pageId_token[REGEX_PREFIX_LEN:]))
elif '..' in pageId_token:
pageId_patterns += generate_range(*pageId_token.split('..', 1))
else:
pageId_patterns += [pageId_token]
if self._cache_flag:
for page_id in self._page_cache.keys():
if page_id in pageId_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]):
ret.append(page_id)
else:
for page in self._tree.getroot().xpath(
'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
namespaces=NS):
page_id = page.get('ID')
if page_id in pageId_patterns or \
any([isinstance(p, typing.Pattern) and p.fullmatch(page_id) for p in pageId_patterns]):
ret.append(page_id)
return ret

ret = [None] * len(for_fileIds)

if self._cache_flag:
for pageId in self._fptr_cache.keys():
for fptr in self._fptr_cache[pageId].keys():
Expand Down Expand Up @@ -728,6 +753,18 @@ def remove_physical_page_fptr(self, fileId):
mets_div.remove(mets_fptr)
return ret

@property
def physical_pages_labels(self):
    """
    Build a dict keyed by physical page ID (the ``@ID`` of each page-level
    ``mets:div`` in the physical ``mets:structMap``). Each value is the tuple
    ``(@ORDER, @ORDERLABEL, @LABEL)`` of that div, with ``None`` standing in
    for any attribute that is not set.
    """
    page_xpath = 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]'
    label_attributes = ('ORDER', 'ORDERLABEL', 'LABEL')
    labels_by_page = {}
    for page_div in self._tree.getroot().xpath(page_xpath, namespaces=NS):
        # Tuple positions follow ``label_attributes``; callers index into it.
        labels_by_page[page_div.get('ID')] = tuple(
            page_div.get(attribute, None) for attribute in label_attributes)
    return labels_by_page

def merge(self, other_mets, force=False, fileGrp_mapping=None, fileId_mapping=None, pageId_mapping=None, after_add_cb=None, **kwargs):
"""
Add all files from other_mets.
Expand Down