From 0eaf610cfe6a0a36cfad2fa71b42d1f921b6e49b Mon Sep 17 00:00:00 2001 From: GeorgeFI Date: Wed, 20 Sep 2023 20:53:19 +0200 Subject: [PATCH 1/3] feat: Added pytesseract wrapper --- pyproject.toml | 1 + requirements/dev_requirements.txt | 13 ++++++++++++- requirements/requirements.txt | 5 +++++ requirements/test_requirements.txt | 7 +++++++ src/sec_certs/utils/pdf.py | 14 +++++++++----- 5 files changed, 34 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e26334d6..877802eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ "networkx", "pydantic", "psutil", + "pytesseract", ] [project.optional-dependencies] diff --git a/requirements/dev_requirements.txt b/requirements/dev_requirements.txt index f4dd8b6d..3724f704 100644 --- a/requirements/dev_requirements.txt +++ b/requirements/dev_requirements.txt @@ -73,6 +73,8 @@ docutils==0.17.1 # sphinx entrypoints==0.4 # via jupyter-client +exceptiongroup==1.1.3 + # via pytest executing==1.2.0 # via stack-data fastjsonschema==2.16.2 @@ -97,6 +99,7 @@ importlib-metadata==5.1.0 # via # jupyter-cache # myst-nb + # sphinx iniconfig==1.1.1 # via pytest ipykernel==6.19.1 @@ -222,6 +225,7 @@ packaging==22.0 # matplotlib # pikepdf # pydata-sphinx-theme + # pytesseract # pytest # setuptools-scm # spacy @@ -252,6 +256,7 @@ pillow==9.3.0 # via # matplotlib # pikepdf + # pytesseract # sec-certs (./../pyproject.toml) pip-tools==6.11.0 # via sec-certs (./../pyproject.toml) @@ -304,6 +309,8 @@ pyrsistent==0.19.2 # via jsonschema pysankeybeta==1.4.0 # via sec-certs (./../pyproject.toml) +pytesseract==0.3.10 + # via sec-certs (./../pyproject.toml) pytest==7.2.0 # via # pytest-cov @@ -434,7 +441,9 @@ tomli==2.0.1 # pytest # setuptools-scm tornado==6.3.3 - # via setuptools-scm + # via + # ipykernel + # jupyter-client tqdm==4.64.1 # via # sec-certs (./../pyproject.toml) @@ -464,10 +473,12 @@ types-urllib3==1.26.25.4 # via types-requests typing-extensions==4.4.0 # via + # black # mypy # myst-nb # myst-parser # pydantic + # pypdf # setuptools-scm urllib3==1.26.13 # via requests diff --git a/requirements/requirements.txt b/requirements/requirements.txt index b0b5ca88..a875a3b5 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -128,6 +128,7 @@ packaging==22.0 # ipykernel # matplotlib # pikepdf + # pytesseract # setuptools-scm # spacy pandas==1.5.2 @@ -152,6 +153,7 @@ pillow==9.3.0 # via # matplotlib # pikepdf + # pytesseract # sec-certs (./../pyproject.toml) pkgconfig==1.5.5 # via sec-certs (./../pyproject.toml) @@ -189,6 +191,8 @@ pyrsistent==0.19.2 # via jsonschema pysankeybeta==1.4.0 # via sec-certs (./../pyproject.toml) +pytesseract==0.3.10 + # via sec-certs (./../pyproject.toml) python-dateutil==2.8.2 # via # jupyter-client @@ -275,6 +279,7 @@ typer==0.7.0 typing-extensions==4.4.0 # via # pydantic + # pypdf # setuptools-scm urllib3==1.26.13 # via requests diff --git a/requirements/test_requirements.txt b/requirements/test_requirements.txt index 2b92e95f..45c9adb6 100644 --- a/requirements/test_requirements.txt +++ b/requirements/test_requirements.txt @@ -52,6 +52,8 @@ distro==1.8.0 # via tabula-py entrypoints==0.4 # via jupyter-client +exceptiongroup==1.1.3 + # via pytest executing==1.2.0 # via stack-data fonttools==4.38.0 @@ -136,6 +138,7 @@ packaging==22.0 # ipykernel # matplotlib # pikepdf + # pytesseract # pytest # setuptools-scm # spacy @@ -161,6 +164,7 @@ pillow==9.3.0 # via # matplotlib # pikepdf + # pytesseract # sec-certs (./../pyproject.toml) pkgconfig==1.5.5 # via sec-certs (./../pyproject.toml) @@ -200,6 +204,8 @@ pyrsistent==0.19.2 # via jsonschema pysankeybeta==1.4.0 # via sec-certs (./../pyproject.toml) +pytesseract==0.3.10 + # via sec-certs (./../pyproject.toml) pytest==7.2.0 # via # pytest-cov @@ -295,6 +301,7 @@ typer==0.7.0 typing-extensions==4.4.0 # via # pydantic + # pypdf # setuptools-scm urllib3==1.26.13 # via requests diff --git a/src/sec_certs/utils/pdf.py b/src/sec_certs/utils/pdf.py index 749a8a5a..b4ec1e94 100644 --- a/src/sec_certs/utils/pdf.py +++ b/src/sec_certs/utils/pdf.py @@ -11,6 +11,7 @@ import pdftotext import pikepdf +import pytesseract from sec_certs import constants from sec_certs.constants import ( @@ -51,13 +52,16 @@ def ocr_pdf_file(pdf_path: Path) -> str: ) if ppm.returncode != 0: raise ValueError(f"pdftoppm failed: {ppm.returncode}") + for ppm_path in map(Path, glob.glob(str(tmppath / "image*.ppm"))): base = ppm_path.with_suffix("") - tes = subprocess.run( - ["tesseract", "-l", "eng+deu+fra", ppm_path, base], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL - ) - if tes.returncode != 0: - raise ValueError(f"tesseract failed: {tes.returncode}") + content = pytesseract.image_to_string(ppm_path, lang="eng+deu+fra") + + if content: + with Path(base.with_suffix(".txt")).open("w") as file: + file.write(content) + else: + raise ValueError(f"OCR failed for document {ppm_path}. Check document manually") contents = "" From 498308a737943b87ecbf38a05404d4caa22fa726 Mon Sep 17 00:00:00 2001 From: GeorgeFI Date: Mon, 25 Sep 2023 21:12:26 +0200 Subject: [PATCH 2/3] fix: OCR with Pytesseract fixed now --- src/sec_certs/utils/pdf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sec_certs/utils/pdf.py b/src/sec_certs/utils/pdf.py index b4ec1e94..7736b5d0 100644 --- a/src/sec_certs/utils/pdf.py +++ b/src/sec_certs/utils/pdf.py @@ -12,6 +12,7 @@ import pdftotext import pikepdf import pytesseract +from PIL import Image from sec_certs import constants from sec_certs.constants import ( @@ -55,7 +56,7 @@ def ocr_pdf_file(pdf_path: Path) -> str: for ppm_path in map(Path, glob.glob(str(tmppath / "image*.ppm"))): base = ppm_path.with_suffix("") - content = pytesseract.image_to_string(ppm_path, lang="eng+deu+fra") + content = pytesseract.image_to_string(Image.open(ppm_path), lang="eng+deu+fra") if content: with Path(base.with_suffix(".txt")).open("w") as file: From 779c19ecca9612861b8d5694c6d8a0c3e27036cb Mon Sep 17 00:00:00 2001 From: GeorgeFI Date: Sun, 1 Oct 2023 14:26:05 +0200 Subject: [PATCH 3/3] fix: Fixed calling method on NoneType --- src/sec_certs/sample/cc_scheme.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sec_certs/sample/cc_scheme.py b/src/sec_certs/sample/cc_scheme.py index 307dfb57..bc0c0792 100644 --- a/src/sec_certs/sample/cc_scheme.py +++ b/src/sec_certs/sample/cc_scheme.py @@ -107,7 +107,7 @@ def get_australia_in_evaluation(enhanced: bool = True) -> list[dict[str, Any]]: if enhanced: e: dict[str, Any] = {} cert_page = _get_page(cert["url"]) - article = cert_page.find("article", attrs={"role": "article"}) + article = cert_page.find("article") blocks = article.find("div").find_all("div", class_="flex", recursive=False) for h2 in blocks[0].find_all("h2"): val = sns(h2.find_next_sibling("span").text)