diff --git a/pyproject.toml b/pyproject.toml index b842379f..bf1df805 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ requires-python = ">=3.11" [project.optional-dependencies] dev = ["black", "isort", "ssort"] -test = ["ruff", "pytest", "pytest-regressions", "freezegun", "pyfakefs"] +test = ["ruff", "pytest", "pytest-regressions", "freezegun", "pandas", "pyfakefs"] deluxe = [ "aiohttp", "annoy", diff --git a/xklb/fsdb/disk_usage.py b/xklb/fsdb/disk_usage.py index 2491490e..639eeabe 100644 --- a/xklb/fsdb/disk_usage.py +++ b/xklb/fsdb/disk_usage.py @@ -100,7 +100,8 @@ def get_data(args) -> list[dict]: if args.database: media = list(args.db.query(*sqlgroups.fs_sql(args, limit=None))) else: - args.paths = [p for p in args.paths if os.path.exists(p)] + if args.hide_deleted: + args.paths = [p for p in args.paths if os.path.exists(p)] media = arg_utils.gen_d(args) media = [d if "size" in d else file_utils.get_filesize(d) for d in media] diff --git a/xklb/mediadb/download_status.py b/xklb/mediadb/download_status.py index 8ea3af54..ce13a6e5 100644 --- a/xklb/mediadb/download_status.py +++ b/xklb/mediadb/download_status.py @@ -64,7 +64,7 @@ def download_status() -> None: extractor_stats[extractor_key]["never_attempted"] += 1 media = [{"extractor_key": extractor_key, **d} for extractor_key, d in extractor_stats.items()] - media = sorted(media, key=lambda x: (-x["never_attempted"], -x["retry_queued"], x["extractor_key"])) + media = sorted(media, key=lambda x: (-x["never_attempted"], -x["retry_queued"], x["extractor_key"] or "")) media_printer.media_printer(args, media, units="extractors") diff --git a/xklb/mediafiles/images_to_pdf.py b/xklb/mediafiles/images_to_pdf.py index b4a6faaa..0a45e49f 100644 --- a/xklb/mediafiles/images_to_pdf.py +++ b/xklb/mediafiles/images_to_pdf.py @@ -12,7 +12,7 @@ def parse_args() -> argparse.Namespace: parser = argparse_utils.ArgumentParser(usage=usage.images_to_pdf) parser.add_argument( - "--delete-original", 
action=argparse.BooleanOptionalAction, default=False, help="Delete source files" + "--delete-original", action=argparse.BooleanOptionalAction, default=False, help="Delete source images" ) parser.add_argument("--output-path", "-o", help="Output PDF file (optional)") diff --git a/xklb/mediafiles/process_ffmpeg.py b/xklb/mediafiles/process_ffmpeg.py index 663eb921..525577b4 100644 --- a/xklb/mediafiles/process_ffmpeg.py +++ b/xklb/mediafiles/process_ffmpeg.py @@ -313,7 +313,7 @@ def process_path(args, path, **kwargs): output_path.name.replace(".%03d", ".000") ) # TODO: support / return multiple paths... - delete_original = args.delete_original + delete_larger = args.delete_larger delete_transcode = False if not output_path.exists(): @@ -321,12 +321,12 @@ def process_path(args, path, **kwargs): output_stats = output_path.stat() - # Never set delete_original to True. That setting comes from args and it is default True + # Never set delete_larger to True. That setting comes from args and it is default True transcode_invalid = False if output_stats.st_size == 0: transcode_invalid = True - elif output_stats.st_size > original_stats.st_size: - delete_original = False + elif delete_larger and output_stats.st_size > original_stats.st_size: + delete_larger = False delete_transcode = True else: try: @@ -344,15 +344,15 @@ def process_path(args, path, **kwargs): if args.delete_unplayable: delete_transcode = False else: - delete_original = False + delete_larger = False delete_transcode = True if video_stream and args.audio_only and not args.no_preserve_video: - delete_original = False + delete_larger = False if delete_transcode: output_path.unlink() return path - elif delete_original: + elif delete_larger: path.unlink() os.utime(output_path, (original_stats.st_atime, original_stats.st_mtime)) diff --git a/xklb/mediafiles/process_image.py b/xklb/mediafiles/process_image.py index e85b9c0a..b0367a39 100644 --- a/xklb/mediafiles/process_image.py +++ b/xklb/mediafiles/process_image.py 
@@ -15,7 +15,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--max-image-height", type=int, default=2400) parser.add_argument("--max-image-width", type=int, default=2400) parser.add_argument( - "--delete-original", action=argparse.BooleanOptionalAction, default=True, help="Delete source files" + "--delete-larger", "--delete-original", action=argparse.BooleanOptionalAction, default=True, help="Delete larger of transcode or original files" ) parser.add_argument("--clean-path", action=argparse.BooleanOptionalAction, default=True, help="Clean output path") arggroups.clobber(parser) @@ -96,11 +96,11 @@ def process_path(args, path): if not output_path.exists(): - return path if path.exists else None + return path if path.exists() else None - if original_stats.st_size > 0 and output_path.stat().st_size > original_stats.st_size: + if original_stats.st_size > 0 and args.delete_larger and output_path.stat().st_size > original_stats.st_size: output_path.unlink() # Remove transcode return path else: - if args.delete_original: + if args.delete_larger: path.unlink() # Remove original os.utime(output_path, (original_stats.st_atime, original_stats.st_mtime)) diff --git a/xklb/mediafiles/process_text.py b/xklb/mediafiles/process_text.py index 4e44e29b..03470008 100644 --- a/xklb/mediafiles/process_text.py +++ b/xklb/mediafiles/process_text.py @@ -14,7 +14,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--max-image-height", type=int, default=2400) parser.add_argument("--max-image-width", type=int, default=2400) parser.add_argument( - "--delete-original", action=argparse.BooleanOptionalAction, default=True, help="Delete source files" + "--delete-larger", "--delete-original", action=argparse.BooleanOptionalAction, default=True, help="Delete larger of transcode or original files" ) parser.add_argument("--clean-path", action=argparse.BooleanOptionalAction, default=True, help="Clean output path") arggroups.clobber(parser) @@ -94,7 +94,7 @@ def convert_to_text_pdf(args, path): log.warning("[%s]: 
Could not run OCR. %s", path, e) else: if os.path.exists(pdf_path): - if args.delete_original and not os.path.samefile(path, pdf_path): + if args.delete_larger and not os.path.samefile(path, pdf_path): os.unlink(path) path = pdf_path @@ -201,7 +201,7 @@ def process_path(args, path): image_paths = file_utils.rglob(str(output_path), consts.IMAGE_EXTENSIONS, quiet=True)[0] mp_image_args = argparse.Namespace( - **{k: v for k, v in args.__dict__.items() if k not in {"db"}} | {"delete_original": True} + **{k: v for k, v in args.__dict__.items() if k not in {"db"}} | {"delete_larger": True} ) with concurrent.futures.ThreadPoolExecutor() as executor: futures = { @@ -218,11 +218,11 @@ def process_path(args, path): update_references(text_path, replacements) # compare final output size - if path_utils.folder_size(output_path) > original_stats.st_size: + if args.delete_larger and path_utils.folder_size(output_path) > original_stats.st_size: devices.rmtree(args, output_path) # Remove transcode return path - if args.delete_original: + if args.delete_larger: path.unlink() # Remove original path_utils.folder_utime(output_path, (original_stats.st_atime, original_stats.st_mtime)) diff --git a/xklb/utils/arg_utils.py b/xklb/utils/arg_utils.py index 68062c90..4c04ac23 100644 --- a/xklb/utils/arg_utils.py +++ b/xklb/utils/arg_utils.py @@ -1,10 +1,12 @@ -import argparse, operator, random +import argparse, operator, os.path, random from collections import defaultdict from copy import copy from pathlib import Path +from typing import Generator from xklb.utils import consts, file_utils, iterables, nums, processes, strings from xklb.utils.consts import SC +from xklb.utils.log_utils import log def gen_paths(args, default_exts=None): @@ -15,7 +17,8 @@ def gen_paths(args, default_exts=None): for path in args.paths: json_data = strings.safe_json_loads(path) if isinstance(json_data, list): - yield from (d["path"] for d in json_data) + for d in json_data: + yield d["path"] elif 
isinstance(json_data, dict): yield json_data["path"] else: @@ -26,6 +29,11 @@ def gen_paths(args, default_exts=None): p = Path(path) if p.is_dir(): yield from file_utils.rglob(str(p), args.ext or default_exts, getattr(args, "exclude", None))[0] + elif args.hide_deleted: + if os.path.exists(p): + yield path + else: + log.info("Skipping non-existent file %s", path) else: yield path @@ -38,9 +46,18 @@ def gen_d(args, default_exts=None): for path in args.paths: json_data = strings.safe_json_loads(path) if isinstance(json_data, list): - yield from json_data + for json_item in json_data: + if args.hide_deleted: + if os.path.exists(json_item["path"]): + yield json_item + else: + yield json_item elif isinstance(json_data, dict): - yield json_data + if args.hide_deleted: + if os.path.exists(json_data["path"]): + yield json_data + else: + yield json_data else: raise TypeError else: @@ -50,6 +67,11 @@ def gen_d(args, default_exts=None): if p.is_dir(): for sp in file_utils.rglob(str(p), args.ext or default_exts, getattr(args, "exclude", None))[0]: yield {"path": sp} + elif args.hide_deleted: + if os.path.exists(p): + yield {"path": path} + else: + log.info("Skipping non-existent file %s", path) else: yield {"path": path} diff --git a/xklb/utils/arggroups.py b/xklb/utils/arggroups.py index 552f3f89..fe6ff10d 100644 --- a/xklb/utils/arggroups.py +++ b/xklb/utils/arggroups.py @@ -1327,7 +1327,7 @@ def process_ffmpeg(parent_parser): "--delete-no-audio", action="store_true", help="Delete files with no audio instead of transcoding video" ) parser.add_argument( - "--delete-original", action=argparse.BooleanOptionalAction, default=True, help="Delete source files" + "--delete-larger", "--delete-original", action=argparse.BooleanOptionalAction, default=True, help="Delete larger of transcode or original files" ) parser.add_argument("--clean-path", action=argparse.BooleanOptionalAction, default=True, help="Clean output path")