From 6d90b3f42d3f2b23da0d58782e36699d1a5366c0 Mon Sep 17 00:00:00 2001
From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com>
Date: Thu, 24 Oct 2024 16:25:43 +0000
Subject: [PATCH] use pdftohtml pdf engine

---
 xklb/mediafiles/process_text.py | 15 ++++++++++++++-
 xklb/utils/consts.py            |  9 ++++++++-
 xklb/utils/processes.py         |  2 ++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/xklb/mediafiles/process_text.py b/xklb/mediafiles/process_text.py
index 2797719d..8498908d 100644
--- a/xklb/mediafiles/process_text.py
+++ b/xklb/mediafiles/process_text.py
@@ -28,6 +28,15 @@ def parse_args() -> argparse.Namespace:
 
     return args
 
+def get_calibre_version():
+    result = processes.cmd('ebook-convert', '--version')
+
+    version_string = result.stdout
+    version_part = version_string.split('(')[1].split(')')[0].split()[1]
+
+    major, minor, patch = map(int, version_part.split('.'))
+
+    return (major, minor, patch)
 
 def update_references(path, replacements):
     try:
@@ -101,6 +110,9 @@ def process_path(args, path):
         # '--linearize-tables',
     ]
 
+    if get_calibre_version() >= (7, 19, 0):
+        command += ['--pdf-engine', 'pdftohtml']
+
     if args.simulate:
         print(shlex.join(command))
         return path
@@ -108,7 +120,8 @@ def process_path(args, path):
     try:
         processes.cmd(*command)
     except subprocess.CalledProcessError:
-        raise
+        log.exception('[%s]: Calibre failed to process book. Skipping...', str(path))
+        return path
 
     if not output_path.exists() or path_utils.is_empty_folder(output_path):
         output_path.unlink()  # Remove transcode
diff --git a/xklb/utils/consts.py b/xklb/utils/consts.py
index c646328c..c708e13a 100644
--- a/xklb/utils/consts.py
+++ b/xklb/utils/consts.py
@@ -217,7 +217,14 @@ def reddit_frequency(frequency) -> str:
         "|flv|insv|inx|swf|wma|wmv|exif|eip|psp|pspimage"
     ).split("|")
 )
-ARCHIVE_EXTENSIONS = set("zip|rar|cbz|cbr|cb7|tar|exe|7z|r00|iso|img|001".split("|"))
+ARCHIVE_EXTENSIONS = set(
+    (
+        "0|1|01|001|0001|7z|ace|alz|alzip|arc|arj|b5i|b6i|bin|bz2|cab|cb7|cbr|cbz|ccd|cdr|cif"
+        "|cpio|daa|deb|dmg|exe|gi|gz|img|iso|lha|lzh|lzma|lzo|mdf|msi|nrg|nsi|nsis"
+        "|p01|pak|pdi|r00|rar|rpm|sit|sitx|tar|bz2|gz|xz|Z|taz|tbz2|tgz|toast|txz"
+        "|tz|udf|uif|vcd|wim|xar|xz|z|zip|zipx|zoo|zst"
+    ).split("|")
+)
 CALIBRE_EXTENSIONS = set(
     (
         "azw|azw3|azw4|cbc|chm|djvu|docx|epub|fb2|fbz|htmlz|lit|lrf|mobi|odt|pdf|prc|pdb|pml|rb|rtf|snb|tcr|md|txtz"
diff --git a/xklb/utils/processes.py b/xklb/utils/processes.py
index 42bdb411..e4242495 100644
--- a/xklb/utils/processes.py
+++ b/xklb/utils/processes.py
@@ -369,6 +369,8 @@ def unar_out_path(archive_path):
 
 
 def lsar(archive_path):
+    # TODO: seems a little slow. maybe compare perf with 7z or https://github.com/wummel/patool
+
     if not which("lsar"):
         log.error("[%s]: The 'lsar' command is not available. Install 'unar' to check archives", archive_path)
         return []