From 8f562231bd099219ec75bbb5a85cfdb9ee1d6022 Mon Sep 17 00:00:00 2001
From: Jacob Chapman <7908073+chapmanjacobd@users.noreply.github.com>
Date: Fri, 25 Oct 2024 00:37:27 +0000
Subject: [PATCH] process-text: add ocrmypdf

---
 pyproject.toml                   |  1 +
 xklb/mediafiles/process_media.py |  6 +++---
 xklb/mediafiles/process_text.py  | 35 ++++++++++++++++++++++++++-----
 xklb/utils/arggroups.py          | 36 ++++++++++++++++++++++++++++++++
 xklb/utils/web.py                |  2 +-
 5 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a2c5606d..80431a87 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ deluxe = [
   "openpyxl",
   "PyExifTool",
   "pymcdm",
+  "ocrmypdf",
   "pyvirtualdisplay",
   "scikit-learn",
   "selenium-wire",
diff --git a/xklb/mediafiles/process_media.py b/xklb/mediafiles/process_media.py
index 1384cfe4..e9d562d1 100644
--- a/xklb/mediafiles/process_media.py
+++ b/xklb/mediafiles/process_media.py
@@ -1,9 +1,7 @@
-import argparse, math, os, sqlite3
+import argparse, concurrent.futures, math, os, sqlite3
 from contextlib import suppress
 from shutil import which
 
-import concurrent.futures
-
 from xklb import usage
 from xklb.mediadb import db_history
 from xklb.mediafiles import process_ffmpeg, process_image, process_text
@@ -78,6 +76,7 @@ def parse_args() -> argparse.Namespace:
 
     arggroups.process_ffmpeg(parser)
     arggroups.clobber(parser)
+    arggroups.ocrmypdf(parser)
     arggroups.debug(parser)
 
     arggroups.database_or_paths(parser)
@@ -86,6 +85,7 @@ def parse_args() -> argparse.Namespace:
 
     arggroups.sql_fs_post(args)
     arggroups.process_ffmpeg_post(args)
+    arggroups.ocrmypdf_post(args)
 
     return args
 
diff --git a/xklb/mediafiles/process_text.py b/xklb/mediafiles/process_text.py
index 8498908d..8e1ab012 100644
--- a/xklb/mediafiles/process_text.py
+++ b/xklb/mediafiles/process_text.py
@@ -18,6 +18,7 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument("--clean-path", action=argparse.BooleanOptionalAction, default=True, help="Clean output path")
     arggroups.clobber(parser)
+    arggroups.ocrmypdf(parser)
     arggroups.requests(parser)
     arggroups.download(parser)
     arggroups.debug(parser)
@@ -26,18 +27,21 @@ def parse_args() -> argparse.Namespace:
     args = parser.parse_intermixed_args()
     arggroups.args_post(args, parser)
 
+    arggroups.ocrmypdf_post(args)
     return args
 
+
 def get_calibre_version():
-    result = processes.cmd('ebook-convert', '--version')
+    result = processes.cmd("ebook-convert", "--version")  # query the ebook-convert CLI for its version
 
     version_string = result.stdout
-    version_part = version_string.split('(')[1].split(')')[0].split()[1]
+    version_part = version_string.split("(")[1].split(")")[0].split()[1]  # 2nd token inside the first "(...)"
 
-    major, minor, patch = map(int, version_part.split('.'))
+    major, minor, patch = map(int, version_part.split("."))  # ValueError unless exactly three integer parts
 
     return (major, minor, patch)
 
+
 def update_references(path, replacements):
     try:
         with open(path, "r", encoding="utf-8") as file:
@@ -64,6 +68,27 @@ def process_path(args, path):
     if ext not in consts.CALIBRE_EXTENSIONS:
         return path
 
+    if ext == "pdf" and not args.no_ocr:
+        import ocrmypdf, ocrmypdf.exceptions
+
+        if not ocrmypdf.pdfa.file_claims_pdfa(Path(path))["pass"]:
+            try:
+                if args.skip_text:
+                    result = ocrmypdf.ocr(path, path, skip_text=True)
+                elif args.redo_ocr:
+                    result = ocrmypdf.ocr(path, path, redo_ocr=True)
+                elif args.force_ocr:
+                    result = ocrmypdf.ocr(path, path, force_ocr=True)
+                log.debug(result)
+            except ocrmypdf.exceptions.EncryptedPdfError:
+                log.info("[%s]: Skipped PDF OCR because it is encrypted", path)
+            except ocrmypdf.exceptions.DigitalSignatureError:
+                log.info("[%s]: Skipped PDF because it has a digital signature", path)
+            except (ocrmypdf.exceptions.TaggedPDFError, ocrmypdf.exceptions.PriorOcrFoundError):
+                log.info("[%s]: Skipped PDF because it already contained text", path)
+            except Exception as e:
+                log.warning("[%s]: Could not run OCR. %s", path, e)
+
     p = Path(path)
     output_path = p.parent
     output_path /= p.stem + ".OEB"
@@ -111,7 +136,7 @@ def process_path(args, path):
     ]
 
     if get_calibre_version() >= (7, 19, 0):
-        command += ['--pdf-engine', 'pdftohtml']
+        command += ["--pdf-engine", "pdftohtml"]
 
     if args.simulate:
         print(shlex.join(command))
@@ -120,7 +145,7 @@ def process_path(args, path):
     try:
         processes.cmd(*command)
     except subprocess.CalledProcessError:
-        log.exception('[%s]: Calibre failed to process book. Skipping...', str(path))
+        log.exception("[%s]: Calibre failed to process book. Skipping...", str(path))
         return path
 
     if not output_path.exists() or path_utils.is_empty_folder(output_path):
diff --git a/xklb/utils/arggroups.py b/xklb/utils/arggroups.py
index d93afaca..3d1c5290 100644
--- a/xklb/utils/arggroups.py
+++ b/xklb/utils/arggroups.py
@@ -1,5 +1,6 @@
 import argparse, os, re, textwrap, typing
 from pathlib import Path
+from shutil import which
 
 from xklb.utils import (
     arg_utils,
@@ -1849,3 +1850,38 @@ def history(parser):
         action="store_true",
         help="Exclude completely watched media",
     )
+
+
+def ocrmypdf(parent_parser):
+    """Add the "OCRMyPDF" argument group: four mutually exclusive store_true OCR mode flags."""
+    exclusive = parent_parser.add_argument_group("OCRMyPDF").add_mutually_exclusive_group()
+    # Flag -> help text; flags are registered in this order.
+    mode_help = {
+        "--no-ocr": "Skip OCR",
+        "--force-ocr": (
+            "Rasterize any text or vector objects on each page, apply OCR, and "
+            "save the rastered output (this rewrites the PDF)"
+        ),
+        "--skip-text": (
+            "Skip OCR on any pages that already contain text, but include the "
+            "page in final output; useful for PDFs that contain a mix of "
+            "images, text pages, and/or previously OCRed pages"
+        ),
+        "--redo-ocr": (
+            "Attempt to detect and remove the hidden OCR layer from files that "
+            "were previously OCRed with OCRmyPDF or another program. Apply OCR "
+            "to text found in raster images. Existing visible text objects will "
+            "not be changed. If there is no existing OCR, OCR will be added."
+        ),
+    }
+    # Every mode is a plain boolean switch; argparse rejects combining two of them.
+    for flag, description in mode_help.items():
+        exclusive.add_argument(flag, action="store_true", help=description)
+
+
+def ocrmypdf_post(args):
+    """Pick a default OCR mode: --skip-text if tesseract, gs and ocrmypdf are available, else --no-ocr."""
+    from importlib.util import find_spec  # local import; only needed to probe for the optional ocrmypdf package
+    if not any([args.no_ocr, args.force_ocr, args.skip_text, args.redo_ocr]):  # only when no mode flag was given
+        can_ocr = bool(which("tesseract") and which("gs") and find_spec("ocrmypdf"))
+        args.skip_text, args.no_ocr = can_ocr, not can_ocr
diff --git a/xklb/utils/web.py b/xklb/utils/web.py
index eb172dc2..dfa8ca37 100644
--- a/xklb/utils/web.py
+++ b/xklb/utils/web.py
@@ -421,7 +421,7 @@ def url_to_local_path(url, response=None, output_path=None, output_prefix=None):
     return output_path
 
 
-def download_url(args, url, output_path=None, retry_num=0):
+def download_url(args, url: str, output_path=None, retry_num=0) -> str | None:
     global session
     if session is None:
         log.warning("Creating new web.session")