Skip to content

Commit

Permalink
Move Tesseract-specific language checks to its plugin
Browse files Browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Jun 1, 2024
1 parent 653c4ff commit 59f6bc8
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 13 deletions.
11 changes: 0 additions & 11 deletions src/ocrmypdf/_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,6 @@ def check_options_languages(
system_lang = locale.getlocale()[0]
if system_lang and not system_lang.startswith('en'):
log.debug("No language specified; assuming --language %s", DEFAULT_LANGUAGE)
DENIED_LANGUAGES = {'equ', 'osd'}
if DENIED_LANGUAGES & set(options.languages):
log.warning(
"The following languages for Tesseract's internal use and should not "
"be issued explicitly: "
f"{', '.join(DENIED_LANGUAGES)}\n"
"OCRmyPDF will ignore them."
)
options.languages = [
lang for lang in options.languages if lang not in DENIED_LANGUAGES
]
if not ocr_engine_languages:
return

Expand Down
9 changes: 9 additions & 0 deletions src/ocrmypdf/builtin_plugins/tesseract_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ocrmypdf._exec import tesseract
from ocrmypdf._jobcontext import PageContext
from ocrmypdf.cli import numeric, str_to_int
from ocrmypdf.exceptions import BadArgsError
from ocrmypdf.helpers import clamp
from ocrmypdf.imageops import calculate_downsample, downsample_image
from ocrmypdf.pluginspec import OcrEngine
Expand Down Expand Up @@ -164,6 +165,14 @@ def check_options(options):
"The --tesseract-pagesegmode argument you select will disable OCR. "
"This may cause processing to fail."
)
DENIED_LANGUAGES = {'equ', 'osd'}
if DENIED_LANGUAGES & set(options.languages):
raise BadArgsError(
"The following languages for Tesseract's internal use and should not "
"be issued explicitly: "
f"{', '.join(DENIED_LANGUAGES & set(options.languages))}\n"
"Remove them from the -l/--language argument."
)


@hookimpl
Expand Down
11 changes: 9 additions & 2 deletions tests/test_tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@

from ocrmypdf import pdfinfo
from ocrmypdf._exec import tesseract
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError

from .conftest import check_ocrmypdf
from .conftest import check_ocrmypdf, run_ocrmypdf_api

# pylint: disable=redefined-outer-name

Expand Down Expand Up @@ -144,3 +144,10 @@ def test_tesseract_log_output_raises(caplog):
with pytest.raises(tesseract.TesseractConfigError):
tesseract.tesseract_log_output(b'parameter not found: moo')
assert 'not found' in caplog.text


def test_blocked_language(resources, no_outpdf):
    """Check that Tesseract's internal-use languages are rejected.

    Requesting either 'osd' or 'equ' through ``-l`` is expected to raise
    ``BadArgsError`` rather than run OCR.
    """
    source_pdf = resources / 'masks.pdf'
    # Each denied language is tried on its own so that one being rejected
    # cannot mask the other being accepted.
    for internal_lang in ('osd', 'equ'):
        with pytest.raises(BadArgsError):
            run_ocrmypdf_api(source_pdf, no_outpdf, '-l', internal_lang)

0 comments on commit 59f6bc8

Please sign in to comment.