Skip to content

Commit

Permalink
use pdftohtml pdf engine
Browse files Browse the repository at this point in the history
  • Loading branch information
chapmanjacobd committed Oct 24, 2024
1 parent 03c0c20 commit 6d90b3f
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 2 deletions.
15 changes: 14 additions & 1 deletion xklb/mediafiles/process_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ def parse_args() -> argparse.Namespace:

return args

def get_calibre_version():
    """Return the installed Calibre version as a ``(major, minor, patch)`` tuple.

    Runs ``ebook-convert --version`` and parses the version number that follows
    the word "calibre" in its output, e.g. ``ebook-convert (calibre 7.19.1)``.
    Components missing from the output count as zero, so a reported ``7.19``
    becomes ``(7, 19, 0)`` and still compares correctly against three-element
    tuples such as ``(7, 19, 0)``.

    Raises:
        ValueError: if no version number can be found in the command output.
    """
    import re  # local import so this block does not depend on the file's import section

    result = processes.cmd('ebook-convert', '--version')
    version_string = result.stdout

    # Anchor on the "calibre X[.Y[.Z]]" token instead of relying on parenthesis
    # and whitespace positions; anything after the digits is ignored.
    match = re.search(r'calibre\s+(\d+)(?:\.(\d+))?(?:\.(\d+))?', version_string)
    if match is None:
        raise ValueError(f'Could not parse Calibre version from: {version_string!r}')

    # Unmatched optional groups are None; treat them as 0.
    major, minor, patch = (int(part) if part else 0 for part in match.groups())

    return (major, minor, patch)

def update_references(path, replacements):
try:
Expand Down Expand Up @@ -101,14 +110,18 @@ def process_path(args, path):
# '--linearize-tables',
]

if get_calibre_version() >= (7, 19, 0):
command += ['--pdf-engine', 'pdftohtml']

if args.simulate:
print(shlex.join(command))
return path

try:
processes.cmd(*command)
except subprocess.CalledProcessError:
raise
log.exception('[%s]: Calibre failed to process book. Skipping...', str(path))
return path

if not output_path.exists() or path_utils.is_empty_folder(output_path):
output_path.unlink() # Remove transcode
Expand Down
9 changes: 8 additions & 1 deletion xklb/utils/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,14 @@ def reddit_frequency(frequency) -> str:
"|flv|insv|inx|swf|wma|wmv|exif|eip|psp|pspimage"
).split("|")
)
ARCHIVE_EXTENSIONS = set("zip|rar|cbz|cbr|cb7|tar|exe|7z|r00|iso|img|001".split("|"))
# Archive file extensions, stored without a leading dot.
# Entries are case-sensitive strings, which is why both "Z" and "z" are listed.
# Each extension appears once here; repeats would collapse in the set anyway.
ARCHIVE_EXTENSIONS = set(
    """
    0 1 01 001 0001 7z ace alz alzip arc arj b5i b6i bin bz2 cab cb7 cbr cbz ccd cdr cif
    cpio daa deb dmg exe gi gz img iso lha lzh lzma lzo mdf msi nrg nsi nsis
    p01 pak pdi r00 rar rpm sit sitx tar xz Z taz tbz2 tgz toast txz
    tz udf uif vcd wim xar z zip zipx zoo zst
    """.split()
)
CALIBRE_EXTENSIONS = set(
(
"azw|azw3|azw4|cbc|chm|djvu|docx|epub|fb2|fbz|htmlz|lit|lrf|mobi|odt|pdf|prc|pdb|pml|rb|rtf|snb|tcr|md|txtz"
Expand Down
2 changes: 2 additions & 0 deletions xklb/utils/processes.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,8 @@ def unar_out_path(archive_path):


def lsar(archive_path):
# TODO: seems a little slow. maybe compare perf with 7z or https://github.com/wummel/patool

if not which("lsar"):
log.error("[%s]: The 'lsar' command is not available. Install 'unar' to check archives", archive_path)
return []
Expand Down

0 comments on commit 6d90b3f

Please sign in to comment.