Skip to content

Commit

Permalink
process-text: add ocrmypdf
Browse files Browse the repository at this point in the history
  • Loading branch information
chapmanjacobd committed Oct 25, 2024
1 parent 6d90b3f commit 8f56223
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 9 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ deluxe = [
"openpyxl",
"PyExifTool",
"pymcdm",
"ocrmypdf",
"pyvirtualdisplay",
"scikit-learn",
"selenium-wire",
Expand Down
6 changes: 3 additions & 3 deletions xklb/mediafiles/process_media.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import argparse, math, os, sqlite3
import argparse, concurrent.futures, math, os, sqlite3
from contextlib import suppress
from shutil import which

import concurrent.futures

from xklb import usage
from xklb.mediadb import db_history
from xklb.mediafiles import process_ffmpeg, process_image, process_text
Expand Down Expand Up @@ -78,6 +76,7 @@ def parse_args() -> argparse.Namespace:

arggroups.process_ffmpeg(parser)
arggroups.clobber(parser)
arggroups.ocrmypdf(parser)
arggroups.debug(parser)

arggroups.database_or_paths(parser)
Expand All @@ -86,6 +85,7 @@ def parse_args() -> argparse.Namespace:

arggroups.sql_fs_post(args)
arggroups.process_ffmpeg_post(args)
arggroups.ocrmypdf_post(args)

return args

Expand Down
35 changes: 30 additions & 5 deletions xklb/mediafiles/process_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def parse_args() -> argparse.Namespace:
)
parser.add_argument("--clean-path", action=argparse.BooleanOptionalAction, default=True, help="Clean output path")
arggroups.clobber(parser)
arggroups.ocrmypdf(parser)
arggroups.requests(parser)
arggroups.download(parser)
arggroups.debug(parser)
Expand All @@ -26,18 +27,21 @@ def parse_args() -> argparse.Namespace:
args = parser.parse_intermixed_args()
arggroups.args_post(args, parser)

arggroups.ocrmypdf_post(args)
return args


def get_calibre_version():
    """Return the installed Calibre version as a ``(major, minor, patch)`` tuple of ints.

    Runs ``ebook-convert --version`` and reads the version from the first
    parenthesised group of its stdout, taking the dotted number that follows
    the first word inside the parentheses (presumably output shaped like
    ``... (calibre 7.19.0)`` -- confirm against the installed tool).

    A missing patch component is treated as 0 and any non-numeric suffix after
    the dotted number is ignored, so the result can always be compared against
    a three-element tuple such as ``(7, 19, 0)``.

    Raises:
        ValueError: if no version number can be found in the command output.
    """
    import re  # function-scope import: does not rely on the module's import list

    result = processes.cmd("ebook-convert", "--version")
    version_string = result.stdout

    # "(" + first word + whitespace + MAJOR.MINOR[.PATCH]
    match = re.search(r"\(\s*\S+\s+(\d+)\.(\d+)(?:\.(\d+))?", version_string)
    if match is None:
        raise ValueError(f"Could not parse Calibre version from {version_string!r}")

    # An absent optional patch group comes back as None -> default to 0.
    major, minor, patch = (int(part) if part else 0 for part in match.groups())

    return (major, minor, patch)


def update_references(path, replacements):
try:
with open(path, "r", encoding="utf-8") as file:
Expand All @@ -64,6 +68,27 @@ def process_path(args, path):
if ext not in consts.CALIBRE_EXTENSIONS:
return path

if ext == "pdf" and not args.no_ocr:
import ocrmypdf, ocrmypdf.exceptions

if not ocrmypdf.pdfa.file_claims_pdfa(Path(path))["pass"]:
try:
if args.skip_text:
result = ocrmypdf.ocr(path, path, skip_text=True)
elif args.redo_ocr:
result = ocrmypdf.ocr(path, path, redo_ocr=True)
elif args.force_ocr:
result = ocrmypdf.ocr(path, path, force_ocr=True)
log.debug(result)
except ocrmypdf.exceptions.EncryptedPdfError:
log.info("[%s]: Skipped PDF OCR because it is encrypted", path)
except ocrmypdf.exceptions.DigitalSignatureError:
log.info("[%s]: Skipped PDF because it has a digital signature", path)
except (ocrmypdf.exceptions.TaggedPDFError, ocrmypdf.exceptions.PriorOcrFoundError):
log.info("[%s]: Skipped PDF because it already contained text", path)
except Exception as e:
log.warning("[%s]: Could not run OCR. %s", path, e)

p = Path(path)
output_path = p.parent
output_path /= p.stem + ".OEB"
Expand Down Expand Up @@ -111,7 +136,7 @@ def process_path(args, path):
]

if get_calibre_version() >= (7, 19, 0):
command += ['--pdf-engine', 'pdftohtml']
command += ["--pdf-engine", "pdftohtml"]

if args.simulate:
print(shlex.join(command))
Expand All @@ -120,7 +145,7 @@ def process_path(args, path):
try:
processes.cmd(*command)
except subprocess.CalledProcessError:
log.exception('[%s]: Calibre failed to process book. Skipping...', str(path))
log.exception("[%s]: Calibre failed to process book. Skipping...", str(path))
return path

if not output_path.exists() or path_utils.is_empty_folder(output_path):
Expand Down
36 changes: 36 additions & 0 deletions xklb/utils/arggroups.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import argparse, os, re, textwrap, typing
from pathlib import Path
from shutil import which

from xklb.utils import (
arg_utils,
Expand Down Expand Up @@ -1849,3 +1850,38 @@ def history(parser):
action="store_true",
help="Exclude completely watched media",
)


def ocrmypdf(parent_parser):
    """Register the OCRmyPDF command-line options on *parent_parser*.

    Adds an "OCRMyPDF" argument group holding one mutually exclusive set of
    boolean flags: ``--no-ocr``, ``--force-ocr``, ``--skip-text`` and
    ``--redo-ocr``. Every flag defaults to ``False``; at most one of them may
    be given on the command line.
    """
    parser = parent_parser.add_argument_group("OCRMyPDF")
    # The four modes conflict with each other, so argparse rejects combinations.
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument("--no-ocr", action="store_true", help="Skip OCR")
    mode.add_argument(
        "--force-ocr",
        action="store_true",
        help="Rasterize any text or vector objects on each page, apply OCR, and "
        "save the rastered output (this rewrites the PDF)",
    )
    mode.add_argument(
        "--skip-text",
        action="store_true",
        help="Skip OCR on any pages that already contain text, but include the "
        "page in final output; useful for PDFs that contain a mix of "
        "images, text pages, and/or previously OCRed pages",
    )
    mode.add_argument(
        "--redo-ocr",
        action="store_true",
        help="Attempt to detect and remove the hidden OCR layer from files that "
        "were previously OCRed with OCRmyPDF or another program. Apply OCR "
        "to text found in raster images. Existing visible text objects will "
        "not be changed. If there is no existing OCR, OCR will be added.",
    )


def ocrmypdf_post(args):
    """Pick a default OCR mode when the user did not choose one.

    If any of ``args.no_ocr``, ``args.force_ocr``, ``args.skip_text`` or
    ``args.redo_ocr`` is already set, *args* is left untouched. Otherwise
    ``args.skip_text`` is enabled when both a ``tesseract`` executable and a
    Ghostscript executable are found on ``PATH``; if either is missing,
    ``args.no_ocr`` is enabled instead so OCR is skipped.

    *args* is modified in place; nothing is returned.
    """
    if any([args.no_ocr, args.force_ocr, args.skip_text, args.redo_ocr]):
        return

    # Ghostscript's console executable is named "gs" on POSIX systems but
    # "gswin64c"/"gswin32c" on Windows, so accept any of them.
    ghostscript = which("gs") or which("gswin64c") or which("gswin32c")

    if which("tesseract") and ghostscript:
        args.skip_text = True
    else:
        args.no_ocr = True
2 changes: 1 addition & 1 deletion xklb/utils/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ def url_to_local_path(url, response=None, output_path=None, output_prefix=None):
return output_path


def download_url(args, url, output_path=None, retry_num=0):
def download_url(args, url: str, output_path=None, retry_num=0) -> str | None:
global session
if session is None:
log.warning("Creating new web.session")
Expand Down

0 comments on commit 8f56223

Please sign in to comment.