Skip to content

Commit

Permalink
Merge branch 'feature/jbig2thresh' into v15
Browse files (browse the repository at this point in the history)
  • Loading branch information
jbarlow83 committed Sep 21, 2023
2 parents 9b77daa + 330352a commit 0388c23
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 7 deletions.
1 change: 1 addition & 0 deletions misc/completion/ocrmypdf.bash
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ __ocrmypdf_arguments()
--jpeg-quality (JPEG quality [0..100])
--png-quality (PNG quality [0..100])
--jbig2-lossy (enable lossy JBIG2 (see docs))
--jbig2-threshold (set JBIG2 threshold (see docs))
--pages (apply OCR to only the specified pages)
--max-image-mpixels (image decompression bomb threshold)
--pdf-renderer (select PDF renderer options)
Expand Down
1 change: 1 addition & 0 deletions misc/completion/ocrmypdf.fish
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ complete -c ocrmypdf -x -l skip-big -d "skip OCR on pages larger than this many
complete -c ocrmypdf -x -l jpeg-quality -d "JPEG quality [0..100]"
complete -c ocrmypdf -x -l png-quality -d "PNG quality [0..100]"
complete -c ocrmypdf -x -l jbig2-lossy -d "enable lossy JBIG2 (see docs)"
complete -c ocrmypdf -x -l jbig2-threshold -d "JBIG2 compression threshold (see docs)"
complete -c ocrmypdf -x -l max-image-mpixels -d "image decompression bomb threshold"
complete -c ocrmypdf -x -l pages -d "apply OCR to only the specified pages"
complete -c ocrmypdf -x -l tesseract-config -d "set custom tesseract config file"
Expand Down
20 changes: 13 additions & 7 deletions src/ocrmypdf/_exec/jbig2enc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@ def available():
return True


def convert_group(*, cwd, infiles, out_prefix):
def convert_group(*, cwd, infiles, out_prefix, threshold):
args = [
'jbig2',
'-b',
out_prefix,
'-s', # symbol mode (lossy)
'--symbol-mode', # symbol mode (lossy)
'-t',
str(threshold), # threshold
# '-r', # refinement mode (lossless symbol mode, currently disabled in
# jbig2)
'-p',
'--pdf',
]
args.extend(infiles)
proc = run(args, cwd=cwd, stdout=PIPE, stderr=PIPE)
Expand All @@ -40,16 +42,20 @@ def convert_group(*, cwd, infiles, out_prefix):


def convert_group_mp(args):
return convert_group(cwd=args[0], infiles=args[1], out_prefix=args[2])
return convert_group(
cwd=args[0], infiles=args[1], out_prefix=args[2], threshold=args[3]
)


def convert_single(*, cwd, infile, outfile):
args = ['jbig2', '-p', infile]
def convert_single(*, cwd, infile, outfile, threshold):
args = ['jbig2', '--pdf', '-t', str(threshold), infile]
with open(outfile, 'wb') as fstdout:
proc = run(args, cwd=cwd, stdout=fstdout, stderr=PIPE)
proc.check_returncode()
return proc


def convert_single_mp(args):
return convert_single(cwd=args[0], infile=args[1], outfile=args[2])
return convert_single(
cwd=args[0], infile=args[1], outfile=args[2], threshold=args[3]
)
10 changes: 10 additions & 0 deletions src/ocrmypdf/builtin_plugins/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,16 @@ def add_options(parser):
# Adjust number of pages to consider at once for JBIG2 compression
help=argparse.SUPPRESS,
)
optimizing.add_argument(
'--jbig2-threshold',
type=numeric(float, 0.4, 0.9),
default=0.85,
metavar='T',
help=(
"Adjust JBIG2 symbol code classification threshold "
"(default 0.85), range 0.4 to 0.9."
),
)


@hookimpl
Expand Down
3 changes: 3 additions & 0 deletions src/ocrmypdf/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ def jbig2_group_args(root: Path, groups: dict[int, list[XrefExt]]):
fspath(root), # =cwd
(img_name(root, xref, ext) for xref, ext in xref_exts), # =infiles
prefix, # =out_prefix
options.jbig2_threshold,
)

def jbig2_single_args(root, groups: dict[int, list[XrefExt]]):
Expand All @@ -379,6 +380,7 @@ def jbig2_single_args(root, groups: dict[int, list[XrefExt]]):
fspath(root),
img_name(root, xref, ext),
root / f'{prefix}.{n:04d}',
options.jbig2_threshold,
)

if options.jbig2_page_group_size > 1:
Expand Down Expand Up @@ -737,6 +739,7 @@ def __init__(
self.png_quality = png_quality
self.jbig2_page_group_size = 0
self.jbig2_lossy = jb2lossy
self.jbig2_threshold = 0.85
self.quiet = True
self.progress_bar = False

Expand Down
2 changes: 2 additions & 0 deletions tests/test_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ def test_jbig2_lossy(lossy, resources, outpdf):
'20',
'--plugin',
'tests/plugins/tesseract_noop.py',
'--jbig2-threshold',
'0.7',
]
if lossy:
args.append('--jbig2-lossy')
Expand Down

0 comments on commit 0388c23

Please sign in to comment.