From 8f562231bd099219ec75bbb5a85cfdb9ee1d6022 Mon Sep 17 00:00:00 2001 From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com> Date: Fri, 25 Oct 2024 00:37:27 +0000 Subject: [PATCH] process-text: add ocrmypdf --- pyproject.toml | 1 + xklb/mediafiles/process_media.py | 6 +++--- xklb/mediafiles/process_text.py | 35 ++++++++++++++++++++++++++----- xklb/utils/arggroups.py | 36 ++++++++++++++++++++++++++++++++ xklb/utils/web.py | 2 +- 5 files changed, 71 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a2c5606d..80431a87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ deluxe = [ "openpyxl", "PyExifTool", "pymcdm", + "ocrmypdf", "pyvirtualdisplay", "scikit-learn", "selenium-wire", diff --git a/xklb/mediafiles/process_media.py b/xklb/mediafiles/process_media.py index 1384cfe4..e9d562d1 100644 --- a/xklb/mediafiles/process_media.py +++ b/xklb/mediafiles/process_media.py @@ -1,9 +1,7 @@ -import argparse, math, os, sqlite3 +import argparse, concurrent.futures, math, os, sqlite3 from contextlib import suppress from shutil import which -import concurrent.futures - from xklb import usage from xklb.mediadb import db_history from xklb.mediafiles import process_ffmpeg, process_image, process_text @@ -78,6 +76,7 @@ def parse_args() -> argparse.Namespace: arggroups.process_ffmpeg(parser) arggroups.clobber(parser) + arggroups.ocrmypdf(parser) arggroups.debug(parser) arggroups.database_or_paths(parser) @@ -86,6 +85,7 @@ def parse_args() -> argparse.Namespace: arggroups.sql_fs_post(args) arggroups.process_ffmpeg_post(args) + arggroups.ocrmypdf_post(args) return args diff --git a/xklb/mediafiles/process_text.py b/xklb/mediafiles/process_text.py index 8498908d..8e1ab012 100644 --- a/xklb/mediafiles/process_text.py +++ b/xklb/mediafiles/process_text.py @@ -18,6 +18,7 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--clean-path", action=argparse.BooleanOptionalAction, default=True, help="Clean output path") arggroups.clobber(parser) + arggroups.ocrmypdf(parser) arggroups.requests(parser) arggroups.download(parser) arggroups.debug(parser) @@ -26,18 +27,21 @@ def parse_args() -> argparse.Namespace: args = parser.parse_intermixed_args() arggroups.args_post(args, parser) + arggroups.ocrmypdf_post(args) return args + def get_calibre_version(): - result = processes.cmd('ebook-convert', '--version') + result = processes.cmd("ebook-convert", "--version") version_string = result.stdout - version_part = version_string.split('(')[1].split(')')[0].split()[1] + version_part = version_string.split("(")[1].split(")")[0].split()[1] - major, minor, patch = map(int, version_part.split('.')) + major, minor, patch = map(int, version_part.split(".")) return (major, minor, patch) + def update_references(path, replacements): try: with open(path, "r", encoding="utf-8") as file: @@ -64,6 +68,27 @@ def process_path(args, path): if ext not in consts.CALIBRE_EXTENSIONS: return path + if ext == "pdf" and not args.no_ocr: + import ocrmypdf, ocrmypdf.exceptions + + if not ocrmypdf.pdfa.file_claims_pdfa(Path(path))["pass"]: + try: + if args.skip_text: + result = ocrmypdf.ocr(path, path, skip_text=True) + elif args.redo_ocr: + result = ocrmypdf.ocr(path, path, redo_ocr=True) + elif args.force_ocr: + result = ocrmypdf.ocr(path, path, force_ocr=True) + log.debug(result) + except ocrmypdf.exceptions.EncryptedPdfError: + log.info("[%s]: Skipped PDF OCR because it is encrypted", path) + except ocrmypdf.exceptions.DigitalSignatureError: + log.info("[%s]: Skipped PDF because it has a digital signature", path) + except (ocrmypdf.exceptions.TaggedPDFError, ocrmypdf.exceptions.PriorOcrFoundError): + log.info("[%s]: Skipped PDF because it already contained text", path) + except Exception as e: + log.warning("[%s]: Could not run OCR. %s", path, e) + p = Path(path) output_path = p.parent output_path /= p.stem + ".OEB" @@ -111,7 +136,7 @@ def process_path(args, path): ] if get_calibre_version() >= (7, 19, 0): - command += ['--pdf-engine', 'pdftohtml'] + command += ["--pdf-engine", "pdftohtml"] if args.simulate: print(shlex.join(command)) @@ -120,7 +145,7 @@ def process_path(args, path): try: processes.cmd(*command) except subprocess.CalledProcessError: - log.exception('[%s]: Calibre failed to process book. Skipping...', str(path)) + log.exception("[%s]: Calibre failed to process book. Skipping...", str(path)) return path if not output_path.exists() or path_utils.is_empty_folder(output_path): diff --git a/xklb/utils/arggroups.py b/xklb/utils/arggroups.py index d93afaca..3d1c5290 100644 --- a/xklb/utils/arggroups.py +++ b/xklb/utils/arggroups.py @@ -1,5 +1,6 @@ import argparse, os, re, textwrap, typing from pathlib import Path +from shutil import which from xklb.utils import ( arg_utils, @@ -1849,3 +1850,38 @@ def history(parser): action="store_true", help="Exclude completely watched media", ) + + +def ocrmypdf(parent_parser): + parser = parent_parser.add_argument_group("OCRMyPDF") + mode = parser.add_mutually_exclusive_group() + mode.add_argument("--no-ocr", action="store_true", help="Skip OCR") + mode.add_argument( + "--force-ocr", + action="store_true", + help="Rasterize any text or vector objects on each page, apply OCR, and " + "save the rastered output (this rewrites the PDF)", + ) + mode.add_argument( + "--skip-text", + action="store_true", + help="Skip OCR on any pages that already contain text, but include the " + "page in final output; useful for PDFs that contain a mix of " + "images, text pages, and/or previously OCRed pages", + ) + mode.add_argument( + "--redo-ocr", + action="store_true", + help="Attempt to detect and remove the hidden OCR layer from files that " + "were previously OCRed with OCRmyPDF or another program. Apply OCR " + "to text found in raster images. Existing visible text objects will " + "not be changed. If there is no existing OCR, OCR will be added.", + ) + + +def ocrmypdf_post(args): + if not any([args.no_ocr, args.force_ocr, args.skip_text, args.redo_ocr]): + if which("tesseract") and which("gs"): + args.skip_text = True + else: + args.no_ocr = True diff --git a/xklb/utils/web.py b/xklb/utils/web.py index eb172dc2..dfa8ca37 100644 --- a/xklb/utils/web.py +++ b/xklb/utils/web.py @@ -421,7 +421,7 @@ def url_to_local_path(url, response=None, output_path=None, output_prefix=None): return output_path -def download_url(args, url, output_path=None, retry_num=0): +def download_url(args, url: str, output_path=None, retry_num=0) -> str | None: global session if session is None: log.warning("Creating new web.session")