Skip to content

Commit

Permalink
Merge remote-tracking branch 'bertsky/workspace-clean'
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Jun 7, 2024
2 parents a1fd8bc + 2291d33 commit 3d49d21
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 3 deletions.
60 changes: 57 additions & 3 deletions src/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
:nested: full
"""
import os
from os import getcwd, unlink
from os.path import relpath, exists, join, isabs
from os import getcwd, rmdir, unlink
from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
from pathlib import Path
from json import loads, dumps
import sys
Expand Down Expand Up @@ -164,7 +164,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
@pass_workspace
def workspace_init(ctx, clobber_mets, directory):
"""
Create a workspace with an empty METS file in --directory.
Create a workspace with an empty METS file in DIRECTORY or CWD.
"""
LOG = getLogger('ocrd.cli.workspace.init')
Expand Down Expand Up @@ -585,6 +585,60 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
raise(e)
workspace.save_mets()

# ----------------------------------------------------------------------
# ocrd workspace clean
# ----------------------------------------------------------------------

@workspace_cli.command('clean')
@click.option('-n', '--dry-run', help="Don't actually do anything to the filesystem, just preview", default=False, is_flag=True)
@click.option('-d', '--directories', help="Remove untracked directories in addition to untracked files", default=False, is_flag=True)
@click.argument('path_glob', nargs=-1, required=False)
@pass_workspace
def clean(ctx, dry_run, directories, path_glob):
    """
    Removes files and directories from the workspace that are not
    referenced by any mets:files.

    PATH_GLOB can be a shell glob expression to match file names,
    directory names (recursively), or plain paths. All paths are
    resolved w.r.t. the workspace.

    If no PATH_GLOB are specified, then all files and directories
    may match.
    """
    log = getLogger('ocrd.cli.workspace.clean')
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)

    # Paths that must survive: every locally available mets:file plus the
    # METS file itself. A set gives O(1) membership tests in the loops below.
    allowed_files = {normpath(f.local_filename) for f in workspace.find_files(local_only=True)}
    allowed_files.add(relpath(workspace.mets_target, start=workspace.directory))

    # Every directory on the way down to a tracked file is protected, not
    # only its immediate parent: rmdir() on such an ancestor would fail
    # because it is not empty. Files directly in the workspace root map to
    # os.curdir, so an explicit '.' argument is skipped as well.
    allowed_dirs = set()
    for tracked in allowed_files:
        parent = dirname(tracked) or os.curdir
        # Stop at the first known ancestor (its own ancestors are already
        # recorded); this also terminates on filesystem roots, where
        # dirname() returns its argument unchanged.
        while parent and parent not in allowed_dirs:
            allowed_dirs.add(parent)
            parent = dirname(parent)

    with pushd_popd(workspace.directory):
        if path_glob:
            candidates = []
            for expression in path_glob:
                if isabs(expression):
                    # globbing happens relative to the workspace (now CWD)
                    expression = relpath(expression)
                # An expression without any match is kept literally.
                # NOTE(review): presumably so that a mistyped path surfaces
                # as an error from unlink() instead of being ignored - confirm.
                candidates += glob(expression, recursive=True) or [expression]
        else:
            candidates = glob('**', recursive=True)

        # Overlapping expressions can yield the same path more than once;
        # keep the first spelling of each so nothing is removed twice.
        paths = list({normpath(path): path for path in candidates}.values())

        file_paths = [path for path in paths if not isdir(path)]
        for path in file_paths:
            if normpath(path) in allowed_files:
                continue
            if dry_run:
                log.info('unlink(%s)', path)
            else:
                unlink(path)

        if not directories:
            return

        dir_paths = [path for path in paths if isdir(path)]
        # Deepest directories first, so children are gone before their
        # parents are attempted.
        for path in sorted(dir_paths, key=lambda p: normpath(p).count(os.sep), reverse=True):
            if normpath(path) in allowed_dirs:
                continue
            if dry_run:
                log.info('rmdir(%s)', path)
            else:
                rmdir(path)

# ----------------------------------------------------------------------
# ocrd workspace list-group
# ----------------------------------------------------------------------
Expand Down
17 changes: 17 additions & 0 deletions tests/cli/test_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,23 @@ def test_prune_files(self):
ws2 = self.resolver.workspace_from_url(join(tempdir, 'ws', 'mets.xml'))
self.assertEqual(len(ws2.mets.find_all_files()), 29)

def test_clean(self):
    """
    ``ocrd workspace clean`` removes untracked files from the workspace
    directory while leaving the METS and its file entries intact.
    """
    with TemporaryDirectory() as tempdir:
        copytree(assets.path_to('SBB0000F29300010000/data'), join(tempdir, 'ws'))

        ws1 = self.resolver.workspace_from_url(join(tempdir, 'ws', 'mets.xml'))
        self.assertEqual(len(ws1.mets.find_all_files(local_only=True)), 29)
        wsdir = Path(ws1.directory)
        # add one file that no mets:file refers to
        untracked = wsdir / 'foo'
        untracked.touch()
        # 8 non-repeating files + mets_one_file.xml + foo
        self.assertEqual(len([f for f in wsdir.rglob('*') if f.is_file()]), 10)

        result = self.runner.invoke(workspace_cli, ['-d', join(tempdir, 'ws'), 'clean'])
        self.assertEqual(result.exit_code, 0)

        # the METS must still load and list the same entries as before
        ws2 = self.resolver.workspace_from_url(join(tempdir, 'ws', 'mets.xml'))
        self.assertEqual(len(ws2.mets.find_all_files(local_only=True)), 29)
        self.assertEqual(len([f for f in wsdir.rglob('*') if f.is_file()]), 8)  # 8 files
        # check the identity of what was removed, not just the count
        self.assertFalse(untracked.exists())

def test_clone_into_nonexisting_dir(self):
"""
https://github.com/OCR-D/core/issues/330
Expand Down

0 comments on commit 3d49d21

Please sign in to comment.