diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b1ad2a31..ca2a9397 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -24,13 +24,17 @@ jobs:
         uses: actions/setup-python@v4
         with:
           python-version: ${{ env.PYTHON_VERSION }}
+      - name: Install Poppler
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install poppler-utils
       - name: Setup virtual environment (no cache hit)
         if: steps.virtualenv-cache.outputs.cache-hit != 'true'
         run: |
           python${{ env.PYTHON_VERSION }} -m venv .venv
           source .venv/bin/activate
           make install-ci
-
+
   lint:
     runs-on: ubuntu-latest
     needs: setup
@@ -80,6 +84,10 @@ jobs:
           python${{ env.PYTHON_VERSION }} -m venv .venv
           source .venv/bin/activate
           make install-ci
+      - name: Install Poppler and Tesseract
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install poppler-utils tesseract-ocr
       - name: Test
        run: |
          source .venv/bin/activate
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 66cb4e5c..e8a3ed08 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.2.5
+
+* Add YoloX model for images and PDFs
+
 ## 0.2.5-dev0
 
 * Add generic model interface
diff --git a/Makefile b/Makefile
index cea2b31c..7ff1cb74 100644
--- a/Makefile
+++ b/Makefile
@@ -29,7 +29,7 @@ install-base-pip-packages:
 
 .PHONY: install-detectron2
 install-detectron2:
-	pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
+	pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@78d5b4f335005091fe0364ce4775d711ec93566e"
 
 .PHONY: install-test
 install-test:
diff --git a/README.md b/README.md
index f5bb5776..b690432a 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,40 @@ If you are using an Apple M1 chip, use `make run-app-dev` instead of `make start
 start the API with hot reloading. The API will run at `http:/localhost:8000`.
 View the swagger documentation at `http://localhost:5000/docs`.
+
+## YoloX model
+
+To use the YoloX model, the endpoints are:
+```
+http://localhost:8000/layout/yolox/pdf
+http://localhost:8000/layout/yolox/image
+```
+For example:
+```
+curl -X 'POST' 'http://localhost:8000/layout/yolox/image' \
+-F 'file=@sample-docs/test-image.jpg' \
+ | jq -C | less -R
+
+curl -X 'POST' 'http://localhost:8000/layout/yolox/pdf' \
+-F 'file=@sample-docs/loremipsum.pdf' \
+ | jq -C | less -R
+```
+
+If your PDF file doesn't have embedded text, you can force the use of OCR with
+the parameter `force_ocr=true`:
+```
+curl -X 'POST' 'http://localhost:8000/layout/yolox/pdf' \
+-F 'file=@sample-docs/loremipsum.pdf' \
+-F force_ocr=true \
+ | jq -C | less -R
+```
+
+or locally:
+
+```
+layout = yolox_local_inference(filename, type="pdf")
+```
+
 ## Security Policy
 
 See our [security policy](https://github.com/Unstructured-IO/unstructured-inference/security/policy) for
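For reference, the same endpoints can be exercised from Python. A minimal sketch using the `requests` library (not a dependency of this repo, purely illustrative; the route, port, and sample file match the README examples above):

```python
# Sketch: POST a PDF to the YoloX route of a locally running API.
import requests

with open("sample-docs/loremipsum.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/layout/yolox/pdf",
        files={"file": ("loremipsum.pdf", f)},
        data={"force_ocr": "true"},  # optional; forces OCR for non-embedded PDFs
    )

response.raise_for_status()
print(response.json())  # serialized DocumentLayout, one entry per page
```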
diff --git a/requirements/base.txt b/requirements/base.txt
index 9c47938c..1e47d983 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -18,6 +18,8 @@ charset-normalizer==3.0.1
     #   requests
 click==8.1.3
     # via uvicorn
+coloredlogs==15.0.1
+    # via onnxruntime
 contourpy==1.0.7
     # via matplotlib
 cryptography==39.0.0
@@ -30,6 +32,8 @@ fastapi==0.89.1
     # via unstructured-inference (setup.py)
 filelock==3.9.0
     # via huggingface-hub
+flatbuffers==23.1.21
+    # via onnxruntime
 fonttools==4.38.0
     # via matplotlib
 h11==0.14.0
@@ -38,23 +42,30 @@ huggingface-hub==0.12.0
     # via
     #   timm
     #   unstructured-inference (setup.py)
+humanfriendly==10.0
+    # via coloredlogs
 idna==3.4
     # via
     #   anyio
     #   requests
 iopath==0.1.10
     # via layoutparser
+jsons==1.6.3
+    # via unstructured-inference (setup.py)
 kiwisolver==1.4.4
     # via matplotlib
 layoutparser[layoutmodels,tesseract]==0.3.4
     # via unstructured-inference (setup.py)
 matplotlib==3.6.3
     # via pycocotools
+mpmath==1.2.1
+    # via sympy
 numpy==1.24.1
     # via
     #   contourpy
     #   layoutparser
     #   matplotlib
+    #   onnxruntime
     #   opencv-python
     #   pandas
     #   pycocotools
@@ -62,6 +73,8 @@ numpy==1.24.1
     #   torchvision
 omegaconf==2.3.0
     # via effdet
+onnxruntime==1.13.1
+    # via unstructured-inference (setup.py)
 opencv-python==4.6.0.66
     # via
     #   layoutparser
@@ -70,6 +83,7 @@ packaging==23.0
     # via
     #   huggingface-hub
     #   matplotlib
+    #   onnxruntime
     #   pytesseract
 pandas==1.5.3
     # via layoutparser
@@ -89,6 +103,8 @@ pillow==9.4.0
     #   torchvision
 portalocker==2.7.0
     # via iopath
+protobuf==4.21.12
+    # via onnxruntime
 pycocotools==2.0.6
     # via effdet
 pycparser==2.21
@@ -127,6 +143,8 @@ sniffio==1.3.0
     # via anyio
 starlette==0.22.0
     # via fastapi
+sympy==1.11.1
+    # via onnxruntime
 timm==0.6.12
     # via effdet
 torch==1.13.1
@@ -152,6 +170,8 @@ typing-extensions==4.4.0
     #   starlette
     #   torch
     #   torchvision
+typish==1.9.3
+    # via jsons
 urllib3==1.26.14
     # via requests
 uvicorn==0.20.0
diff --git a/requirements/dev.txt b/requirements/dev.txt
index 955df12c..ccf46f01 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -25,7 +25,7 @@ attrs==22.2.0
     # via jsonschema
 backcall==0.2.0
     # via ipython
-beautifulsoup4==4.11.1
+beautifulsoup4==4.11.2
     # via nbconvert
 bleach==6.0.0
     # via nbconvert
@@ -59,7 +59,7 @@ importlib-metadata==6.0.0
     #   nbconvert
 importlib-resources==5.10.2
     # via jsonschema
-ipykernel==6.21.0
+ipykernel==6.20.2
     # via
     #   ipywidgets
     #   jupyter
@@ -111,7 +111,6 @@ jupyter-console==6.4.4
     # via jupyter
 jupyter-core==5.2.0
     # via
-    #   ipykernel
     #   jupyter-client
     #   jupyter-server
     #   nbclassic
@@ -161,6 +160,7 @@ nbformat==5.7.3
     #   notebook
 nest-asyncio==1.5.6
     # via
+    #   ipykernel
     #   nbclassic
     #   notebook
 notebook==6.5.2
@@ -182,7 +182,7 @@ pexpect==4.8.0
     # via ipython
 pickleshare==0.7.5
     # via ipython
-pip-tools==6.12.1
+pip-tools==6.12.2
     # via -r requirements/dev.in
 pkgutil-resolve-name==1.3.10
     # via jsonschema
diff --git a/requirements/test.in b/requirements/test.in
index d938a877..f57d2de7 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -10,5 +10,7 @@ httpx
 flake8
 mypy
 pytest-cov
+pdf2image>=1.16.2
+huggingface_hub>=0.11.1
 label_studio_sdk
 vcrpy
diff --git a/requirements/test.txt b/requirements/test.txt
index fab3639d..c4459358 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -27,6 +27,8 @@ coverage[toml]==7.1.0
     #   pytest-cov
 exceptiongroup==1.1.0
     # via pytest
+filelock==3.9.0
+    # via huggingface-hub
 flake8==6.0.0
     # via -r requirements/test.in
 h11==0.14.0
@@ -35,6 +37,8 @@ httpcore==0.16.3
     # via httpx
 httpx==0.23.3
     # via -r requirements/test.in
+huggingface-hub==0.12.0
+    # via -r requirements/test.in
 idna==3.4
     # via
     #   anyio
@@ -58,9 +62,15 @@ mypy-extensions==0.4.3
     #   black
     #   mypy
 packaging==23.0
-    # via pytest
+    # via
+    #   huggingface-hub
+    #   pytest
 pathspec==0.11.0
     # via black
+pdf2image==1.16.2
+    # via -r requirements/test.in
+pillow==9.4.0
+    # via pdf2image
 platformdirs==2.6.2
     # via black
 pluggy==1.0.0
@@ -76,9 +86,13 @@ pytest==7.2.1
 pytest-cov==4.0.0
     # via -r requirements/test.in
 pyyaml==6.0
-    # via vcrpy
+    # via
+    #   huggingface-hub
+    #   vcrpy
 requests==2.28.2
-    # via label-studio-sdk
+    # via
+    #   huggingface-hub
+    #   label-studio-sdk
 rfc3986[idna2008]==1.5.0
     # via httpx
 six==1.16.0
@@ -94,9 +108,12 @@ tomli==2.0.1
     #   coverage
     #   mypy
     #   pytest
+tqdm==4.64.1
+    # via huggingface-hub
 typing-extensions==4.4.0
     # via
     #   black
+    #   huggingface-hub
     #   mypy
     #   pydantic
 urllib3==1.26.14
diff --git a/sample-docs/empty-document.pdf b/sample-docs/empty-document.pdf
new file mode 100644
index 00000000..393a3b39
Binary files /dev/null and b/sample-docs/empty-document.pdf differ
diff --git a/sample-docs/non-embedded.pdf b/sample-docs/non-embedded.pdf
new file mode 100644
index 00000000..427fecaf
Binary files /dev/null and b/sample-docs/non-embedded.pdf differ
diff --git a/sample-docs/test-image.jpg b/sample-docs/test-image.jpg
new file mode 100644
index 00000000..328614bd
Binary files /dev/null and b/sample-docs/test-image.jpg differ
diff --git a/setup.py b/setup.py
index fbf405dd..719a718d 100644
--- a/setup.py
+++ b/setup.py
@@ -57,6 +57,8 @@
         # on RHEL7. We can remove this pin once the following issue from 12/2022 is resolved
         # ref: https://github.com/opencv/opencv-python/issues/772
         "opencv-python==4.6.0.66",
+        "onnxruntime",
+        "jsons"
     ],
     extras_require={},
 )
diff --git a/test_unstructured_inference/models/test_tesseract.py b/test_unstructured_inference/models/test_tesseract.py
index 43d92e4c..b5babe4b 100644
--- a/test_unstructured_inference/models/test_tesseract.py
+++ b/test_unstructured_inference/models/test_tesseract.py
@@ -11,6 +11,7 @@ def __init__(self, languages):
 
 def test_load_agent(monkeypatch):
     monkeypatch.setattr(tesseract, "TesseractAgent", MockTesseractAgent)
+    monkeypatch.setattr(tesseract, "ocr_agent", None)
 
     with patch.object(tesseract, "is_pytesseract_available", return_value=True):
         tesseract.load_agent()
diff --git a/test_unstructured_inference/test_api.py b/test_unstructured_inference/test_api.py
index f7efe995..a91db344 100644
--- a/test_unstructured_inference/test_api.py
+++ b/test_unstructured_inference/test_api.py
@@ -1,11 +1,14 @@
-import pytest
 import os
+import shutil
 
+import jsons
+import pytest
 from fastapi.testclient import TestClient
 
 from unstructured_inference import api
 from unstructured_inference.models import base as models
 from unstructured_inference.inference.layout import DocumentLayout
+from unstructured_inference.models.yolox import yolox_local_inference
 
 
 class MockModel:
@@ -40,7 +43,9 @@ def test_layout_parsing_api(monkeypatch, filetype, ext, data, process_func, expe
     client = TestClient(api.app)
     response = client.post(
-        f"/layout/{filetype}", files={"file": (filename, open(filename, "rb"))}, data=data
+        f"/layout/detectron/{filetype}",
+        files={"file": (filename, open(filename, "rb"))},
+        data=data,
     )
 
     assert response.status_code == expected_response_code
@@ -52,6 +57,90 @@ def test_bad_route_404():
     assert response.status_code == 404
 
 
+def test_layout_v02_api_parsing_image():
+
+    filename = os.path.join("sample-docs", "test-image.jpg")
+
+    client = TestClient(api.app)
+    response = client.post(
+        "/layout/yolox/image",
+        headers={"Accept": "multipart/mixed"},
+        files=[("file", (filename, open(filename, "rb"), "image/jpeg"))],
+    )
+    doc_layout = jsons.load(response.json(), DocumentLayout)
+    assert len(doc_layout.pages) == 1
+    # NOTE(benjamin) The example sent to the test contains 13 detections
+    assert len(doc_layout.pages[0]["layout"]) == 13
+    assert response.status_code == 200
+
+
+def test_layout_v02_api_parsing_pdf():
+
+    filename = os.path.join("sample-docs", "loremipsum.pdf")
+
+    client = TestClient(api.app)
+    response = client.post(
+        "/layout/yolox/pdf",
+        files={"file": (filename, open(filename, "rb"))},
+    )
+    doc_layout = jsons.load(response.json(), DocumentLayout)
+    assert len(doc_layout.pages) == 1
+    # NOTE(benjamin) The example sent to the test contains 5 detections
+    assert len(doc_layout.pages[0]["layout"]) == 5
+    assert response.status_code == 200
+
+
+def test_layout_v02_api_parsing_pdf_ocr():
+
+    filename = os.path.join("sample-docs", "non-embedded.pdf")
+
+    client = TestClient(api.app)
+    response = client.post(
+        "/layout/yolox/pdf",
+        files={"file": (filename, open(filename, "rb"))},
+        data={"force_ocr": True},
+    )
+    doc_layout = jsons.load(response.json(), DocumentLayout)
+    assert len(doc_layout.pages) == 10
+    assert len(doc_layout.pages[0]["layout"]) > 1
+    assert response.status_code == 200
+
+
+def test_layout_v02_local_parsing_image():
+    filename = os.path.join("sample-docs", "test-image.jpg")
+    OUTPUT_DIR = "yolox_output"
+    # NOTE(benjamin) passing output_directory creates a file for each page image
+    # in local storage for visualization of the result
+    if os.path.exists(OUTPUT_DIR):
+        # NOTE(benjamin): should delete the default output folder on test?
+        shutil.rmtree(OUTPUT_DIR)
+    document_layout_1 = yolox_local_inference(filename, type="image", output_directory=OUTPUT_DIR)
+    assert len(document_layout_1.pages) == 1
+    document_layout_2 = yolox_local_inference(filename, type="image")
+    # NOTE(benjamin) The example image should produce a single-page result
+    assert len(document_layout_2.pages) == 1
+    # NOTE(benjamin) The example sent to the test contains 13 detections
+    assert len(document_layout_2.pages[0].layout) == 13
+
+
+def test_layout_v02_local_parsing_pdf():
+    filename = os.path.join("sample-docs", "loremipsum.pdf")
+    document_layout = yolox_local_inference(filename, type="pdf")
+    content = document_layout.to_string()
+    assert "Lorem ipsum" in content
+    assert len(document_layout.pages) == 1
+    # NOTE(benjamin) The example sent to the test contains 5 detections
+    assert len(document_layout.pages[0].layout) == 5
+
+
+def test_layout_v02_local_parsing_empty_pdf():
+    filename = os.path.join("sample-docs", "empty-document.pdf")
+    document_layout = yolox_local_inference(filename, type="pdf")
+    assert len(document_layout.pages) == 1
+    # NOTE(benjamin) The empty document should produce no detections
+    assert len(document_layout.pages[0].layout) == 0
+
+
 def test_healthcheck(monkeypatch):
     client = TestClient(api.app)
     response = client.get("/healthcheck")
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index 873052c4..ebcc9adb 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.5-dev0"  # pragma: no cover
+__version__ = "0.2.5"  # pragma: no cover
diff --git a/unstructured_inference/api.py b/unstructured_inference/api.py
index 0a207214..ad5b0b91 100644
--- a/unstructured_inference/api.py
+++ b/unstructured_inference/api.py
@@ -2,6 +2,9 @@
 from unstructured_inference.inference.layout import process_data_with_model
 from unstructured_inference.models.base import UnknownModelException
 from typing import List
+import tempfile
+
+from unstructured_inference.models.yolox import yolox_local_inference
 
 app = FastAPI()
 
@@ -9,7 +12,7 @@
 VALID_FILETYPES = ["pdf", "image"]
 
 
-@app.post("/layout/{filetype:path}")
+@app.post("/layout/detectron/{filetype:path}")
 async def layout_parsing(
     filetype: str,
     file: UploadFile = File(),
@@ -38,6 +41,22 @@
     return {"pages": pages_layout}
 
 
+@app.post("/layout/yolox/{filetype:path}")
+async def layout_parsing_yolox(
+    filetype: str,
+    request: Request,
+    file: List[UploadFile] = File(default=None),
+    force_ocr: bool = Form(default=False),
+):
+
+    with tempfile.NamedTemporaryFile() as tmp_file:
+        tmp_file.write(file[0].file.read())
+        tmp_file.flush()
+        detections = yolox_local_inference(tmp_file.name, type=filetype, use_ocr=force_ocr)
+
+    return detections
+
+
 @app.get("/healthcheck", status_code=status.HTTP_200_OK)
 async def healthcheck(request: Request):
     return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"}
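Unlike the detectron route above it, the new yolox route does not check the `filetype` path parameter against `VALID_FILETYPES`. A small guard along these lines could be added inside `layout_parsing_yolox`; this is a hypothetical addition, not part of this diff:

```python
# Hypothetical guard mirroring the detectron route's VALID_FILETYPES check.
from fastapi import HTTPException

def validate_filetype(filetype: str) -> None:
    # VALID_FILETYPES = ["pdf", "image"] is defined at module level in api.py
    if filetype not in VALID_FILETYPES:
        raise HTTPException(status_code=404, detail=f"Unsupported filetype: {filetype}")
```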
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index 753cb8c6..6bf5ee19 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -16,13 +16,15 @@
 import unstructured_inference.models.tesseract as tesseract
 from unstructured_inference.models.base import get_model
 from unstructured_inference.models.unstructuredmodel import UnstructuredModel
+import cv2
 
 
 @dataclass
 class LayoutElement:
     type: str
-    # NOTE(robinson) - Coordinates are reported starting from the upper left and
-    # proceeding clockwise
+    # NOTE(robinson) - The list contains two (x, y) tuples: the first is the
+    # upper left corner and the second is the bottom right corner
     coordinates: List[Tuple[float, float]]
     text: Optional[str] = None
 
@@ -32,18 +34,30 @@ def __str__(self):
     def to_dict(self):
         return self.__dict__
 
+    def get_width(self):
+        # NOTE(benjamin) i.e.: x2 - x1
+        return self.coordinates[1][0] - self.coordinates[0][0]
+
+    def get_height(self):
+        # NOTE(benjamin) i.e.: y2 - y1
+        return self.coordinates[1][1] - self.coordinates[0][1]
+
 
 class DocumentLayout:
     """Class for handling documents that are saved as .pdf files. For .pdf files,
     a document image analysis (DIA) model detects the layout of the page prior to extracting
     element."""
 
-    def __init__(self):
-        self._pages = None
+    def __init__(self, pages=None):
+        self._pages = pages
 
     def __str__(self) -> str:
         return "\n\n".join([str(page) for page in self.pages])
 
+    def to_string(self):
+        # Temporary method, this should replace __str__
+        return "\n\n".join([page.to_string() for page in self.pages])
+
     @property
     def pages(self) -> List[PageLayout]:
         """Gets all elements from pages in sequential order."""
@@ -90,6 +104,69 @@ def from_image_file(cls, filename: str, model: Optional[Detectron2LayoutModel] =
         page.get_elements()
         return cls.from_pages([page])
 
+    def parse_elements(self, pdf_filename, DPI=500):
+        """
+        Fill in the text of the document elements from the PDF's embedded text
+        """
+        with tempfile.TemporaryDirectory() as tmp_folder:
+
+            for n_page, page in enumerate(self._pages):
+                new_layout = []
+                for n_element, element in enumerate(page.layout):
+
+                    (upper_left_x, upper_left_y) = element.coordinates[0]
+                    dest_file = os.path.join(tmp_folder, f"{n_page}-{n_element}.txt")
+
+                    cmd = (
+                        f"pdftotext -r {DPI} -x {int(upper_left_x)} -y {int(upper_left_y)} "
+                        + f"-W {int(element.get_width())} -H {int(element.get_height())} "
+                        + f"-f {page.number} -l {page.number} {pdf_filename} {dest_file}"
+                    )
+
+                    exit_code = os.system(cmd)
+
+                    if exit_code == 0:
+                        with open(dest_file) as file:
+                            content = file.read()
+                        element.text = content
+                        new_layout.append(element)
+                    else:
+                        continue
+                new_page = PageLayout(number=page.number, image=None, layout=new_layout)
+                self._pages[n_page] = new_page
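`parse_elements` shells out to Poppler's `pdftotext` with a crop box per element. How the flags map to the element geometry, as a standalone sketch with illustrative numbers (`subprocess.run` is used here instead of `os.system` only to make the argument structure explicit):

```python
# Sketch of the pdftotext invocation parse_elements builds. -x/-y give the
# crop origin, -W/-H the crop size (in pixels at the -r resolution), and
# -f/-l restrict extraction to a single page.
import subprocess

cmd = [
    "pdftotext",
    "-r", "500",               # DPI the page coordinates were computed at
    "-x", "100", "-y", "200",  # upper left corner of the element
    "-W", "400", "-H", "50",   # element width and height
    "-f", "1", "-l", "1",      # first and last page to consider
    "sample-docs/loremipsum.pdf",
    "element.txt",             # destination text file
]
subprocess.run(cmd, check=True)
```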
+
+    def parse_image_elements(self, filename, num, DPI=500):
+        """
+        Fill in the text of the document elements using OCR
+        """
+        with tempfile.TemporaryDirectory() as tmp_folder:
+            n_page = num
+            page = self._pages[n_page]
+
+            new_layout = []
+            image = cv2.imread(filename)
+            for n_element, element in enumerate(page.layout):
+
+                (upper_left_x, upper_left_y) = element.coordinates[0]
+                upper_left_x = int(upper_left_x)
+                upper_left_y = int(upper_left_y)
+                lower_right_x = upper_left_x + int(element.get_width())
+                lower_right_y = upper_left_y + int(element.get_height())
+                dest_file = os.path.join(tmp_folder, f"{n_page}-{n_element}.jpg")
+
+                patch = image[upper_left_y:lower_right_y, upper_left_x:lower_right_x]
+                cv2.imwrite(dest_file, patch)
+                # NOTE: load the OCR agent lazily; loading it eagerly makes
+                # test_load_agent fail
+                if not tesseract.ocr_agent:
+                    tesseract.load_agent()
+                text = tesseract.ocr_agent.detect(patch)
+
+                element.text = text
+                new_layout.append(element)
+
+            new_page = PageLayout(number=page.number, image=None, layout=new_layout)
+            self._pages[n_page] = new_page
+
 
 class PageLayout:
     """Class for an individual PDF page."""
@@ -111,6 +188,10 @@ def __init__(
     def __str__(self):
         return "\n\n".join([str(element) for element in self.elements])
 
+    def to_string(self):
+        """Temporary method, should replace __str__"""
+        return "\n\n".join([str(element) for element in self.layout])
+
     def get_elements(self, inplace=True) -> Optional[List[LayoutElement]]:
         """Uses specified model to detect the elements on the page."""
         logger.info("Detecting page elements ...")
diff --git a/unstructured_inference/models/__init__.py b/unstructured_inference/models/__init__.py
index e69de29b..ade0a245 100644
--- a/unstructured_inference/models/__init__.py
+++ b/unstructured_inference/models/__init__.py
@@ -0,0 +1,46 @@
+from typing import Dict, Optional, Tuple
+from huggingface_hub import hf_hub_download
+
+
+def _get_model_loading_info(model: str) -> Tuple[str, Optional[str], Dict[int, str]]:
+    """Gets local model binary and config locations and label map, downloading if necessary."""
+    # TODO(alan): Find the right way to map model name to retrieval. It seems off that testing
+    # needs to mock hf_hub_download.
+    if model == "checkbox":
+        repo_id = "unstructuredio/oer-checkbox"
+        binary_fn = "detectron2_finetuned_oer_checkbox.pth"
+        config_fn = "detectron2_oer_checkbox.json"
+        model_path = hf_hub_download(repo_id, binary_fn)
+        config_path = hf_hub_download(repo_id, config_fn)
+        label_map = {0: "Unchecked", 1: "Checked"}
+    elif model == "yolox":
+        # NOTE(benjamin) Repository and file to download from Hugging Face
+        repo_id = "unstructuredio/yolo_x_layout"
+        binary_fn = "yolox_l0.05.onnx"
+        model_path = hf_hub_download(repo_id, binary_fn)
+        label_map = {
+            0: "Caption",
+            1: "Footnote",
+            2: "Formula",
+            3: "List-item",
+            4: "Page-footer",
+            5: "Page-header",
+            6: "Picture",
+            7: "Section-header",
+            8: "Table",
+            9: "Text",
+            10: "Title",
+        }
+        # NOTE(benjamin): kept so the return signature matches the previous
+        # version of this function
+        config_path = None
+
+    else:
+        raise UnknownModelException(f"Unknown model type: {model}")
+    # NOTE(benjamin): maybe this could return a dictionary instead of this set of variables
+    return model_path, config_path, label_map
+
+
+class UnknownModelException(Exception):
+    """Exception for the case where a model is called for with an unrecognized identifier."""
+
+    pass
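How the loading info is consumed downstream, mirroring what `image_processing` in yolox.py does; a minimal sketch (`hf_hub_download` caches the weights locally, so only the first run downloads):

```python
# Sketch: resolve the YoloX ONNX weights and label map, then open an
# onnxruntime session on them.
import onnxruntime

from unstructured_inference.models import _get_model_loading_info

model_path, config_path, label_map = _get_model_loading_info("yolox")
session = onnxruntime.InferenceSession(model_path)
print(session.get_inputs()[0].name)  # name of the model's input tensor
print(label_map[9])                  # "Text"
```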
diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py
new file mode 100644
index 00000000..1aaa61bd
--- /dev/null
+++ b/unstructured_inference/models/yolox.py
@@ -0,0 +1,268 @@
+# Copyright (c) Megvii, Inc. and its affiliates.
+# Unstructured modified the original source code found at:
+# https://github.com/Megvii-BaseDetection/YOLOX/blob/237e943ac64aa32eb32f875faa93ebb18512d41d/yolox/data/data_augment.py
+# https://github.com/Megvii-BaseDetection/YOLOX/blob/ac379df3c97d1835ebd319afad0c031c36d03f36/yolox/utils/demo_utils.py
+import os
+import tempfile
+from typing import Optional
+from PIL import Image
+
+import cv2
+import numpy as np
+import onnxruntime
+from pdf2image import convert_from_path
+
+from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
+from unstructured_inference.models import _get_model_loading_info
+from unstructured_inference.visualize import draw_bounding_boxes
+
+
+def yolox_local_inference(
+    filename: str,
+    type: str = "image",
+    use_ocr=False,
+    output_directory: Optional[str] = None,
+) -> DocumentLayout:
+    """This function creates a DocumentLayout from a file in local storage.
+    Parameters
+    ----------
+    type
+        Accepted values are "image" and "pdf"
+    use_ocr
+        For PDFs without embedded text, setting this to True extracts the
+        text with OCR instead
+    output_directory
+        Default None; if specified, the YoloX detections are drawn over the
+        page images and saved to this folder
+    """
+    DPI = 500
+    pages_paths = []
+    detections = []
+    detectedDocument = None
+    if type == "pdf":
+        with tempfile.TemporaryDirectory() as tmp_folder:
+            pages_paths = convert_from_path(
+                filename, dpi=DPI, output_folder=tmp_folder, paths_only=True
+            )
+            for i, path in enumerate(pages_paths):
+                # image_processing returns the PageLayout for page i
+                detections.append(
+                    image_processing(path, page_number=i, output_directory=output_directory)
+                )
+            detectedDocument = DocumentLayout(detections)
+            if use_ocr:
+                for n, page_path in enumerate(pages_paths):
+                    detectedDocument.parse_image_elements(page_path, n)
+            else:
+                # Extract embedded text from the PDF
+                detectedDocument.parse_elements(filename, DPI=DPI)
+    else:
+        # Single image: build a one-page DocumentLayout and OCR it
+        detections = [
+            image_processing(
+                filename, origin_img=None, page_number=0, output_directory=output_directory
+            )
+        ]
+        detectedDocument = DocumentLayout(detections)
+        detectedDocument.parse_image_elements(filename, 0)
+
+    return detectedDocument
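A usage sketch for local inference, using only parameters defined above and the sample docs added in this PR (`use_ocr` only applies to PDFs; `output_directory` saves annotated page images for inspection):

```python
# Sketch: run YoloX locally on the non-embedded sample PDF, forcing OCR and
# dumping annotated page images to yolox_output/ for review.
from unstructured_inference.models.yolox import yolox_local_inference

layout = yolox_local_inference(
    "sample-docs/non-embedded.pdf",
    type="pdf",
    use_ocr=True,                     # the PDF has no embedded text layer
    output_directory="yolox_output",  # optional visualization output
)
print(layout.to_string())
```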
+
+
+def image_processing(
+    page: str,
+    origin_img: Optional[Image.Image] = None,
+    page_number: int = 0,
+    output_directory: Optional[str] = None,
+) -> PageLayout:
+    """Method running YoloX for layout detection, returns a PageLayout
+    Parameters
+    ----------
+    page
+        Path for the image file with the image to process
+    origin_img
+        If specified, an Image object to process with the YoloX model
+    page_number
+        Number assigned to the PageLayout returned
+    output_directory
+        If specified, the annotated image is stored in this directory
+    """
+    # The model was trained and exported with this shape
+    # TODO (benjamin): check other shapes for inference
+    input_shape = (1024, 768)
+    if origin_img is not None and page:
+        raise ValueError("Only one of the arguments 'page' or 'origin_img' is allowed")
+    if origin_img is None:
+        origin_img = cv2.imread(page)
+    img, ratio = preprocess(origin_img, input_shape)
+    page_orig = page
+    # TODO (benjamin): we should use models.get_model(), but it currently returns a Detectron model
+    model_path, _, LAYOUT_CLASSES = _get_model_loading_info("yolox")
+    session = onnxruntime.InferenceSession(model_path)
+
+    ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
+    output = session.run(None, ort_inputs)
+    predictions = demo_postprocess(output[0], input_shape, p6=False)[0]  # TODO(benjamin): check for p6
+
+    boxes = predictions[:, :4]
+    scores = predictions[:, 4:5] * predictions[:, 5:]
+
+    # Convert boxes from (center_x, center_y, w, h) to (x1, y1, x2, y2) and
+    # undo the resize ratio applied in preprocess()
+    boxes_xyxy = np.ones_like(boxes)
+    boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0
+    boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0
+    boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0
+    boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
+    boxes_xyxy /= ratio
+    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
+
+    # If dets is None the page is created empty; otherwise this object is replaced below
+    page_layout = PageLayout(number=page_number, image=None, layout=[])
+
+    if dets is not None:
+        final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
+        annotated_image = draw_bounding_boxes(
+            origin_img,
+            final_boxes,
+            final_scores,
+            final_cls_inds,
+            conf=0.3,
+            class_names=LAYOUT_CLASSES,
+        )
+
+        elements = []
+        # Each detection has the format (x1, y1, x2, y2, probability, class),
+        # where (x1, y1) is the top left and (x2, y2) the bottom right corner
+        for det in dets:
+            detection = det.tolist()
+            detection[-1] = LAYOUT_CLASSES[int(detection[-1])]
+            element = LayoutElement(
+                type=detection[-1],
+                coordinates=[(detection[0], detection[1]), (detection[2], detection[3])],
+                text=" ",
+            )
+
+            elements.append(element)
+
+        elements.sort(key=lambda element: element.coordinates[0][1])
+
+        page_layout = PageLayout(
+            number=page_number, image=None, layout=elements
+        )  # TODO(benjamin): encode image as base64?
+        if output_directory:
+            if not os.path.exists(output_directory):
+                os.makedirs(output_directory)
+            # NOTE(benjamin): the tmp file lacks an extension
+            output_path = os.path.join(output_directory, os.path.basename(page_orig))
+            cv2.imwrite(output_path, annotated_image)
+
+    return page_layout
+
+
+# NOTE: the preprocess function was named preproc in the original source
+
+
+def preprocess(img, input_size, swap=(2, 0, 1)):
+    if len(img.shape) == 3:
+        padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+    else:
+        padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+    r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+    resized_img = cv2.resize(
+        img,
+        (int(img.shape[1] * r), int(img.shape[0] * r)),
+        interpolation=cv2.INTER_LINEAR,
+    ).astype(np.uint8)
+    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+    padded_img = padded_img.transpose(swap)
+    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+    return padded_img, r
+
+
+def demo_postprocess(outputs, img_size, p6=False):
+    grids = []
+    expanded_strides = []
+
+    if not p6:
+        strides = [8, 16, 32]
+    else:
+        strides = [8, 16, 32, 64]
+
+    hsizes = [img_size[0] // stride for stride in strides]
+    wsizes = [img_size[1] // stride for stride in strides]
+
+    for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+        xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+        grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+        grids.append(grid)
+        shape = grid.shape[:2]
+        expanded_strides.append(np.full((*shape, 1), stride))
+
+    grids = np.concatenate(grids, 1)
+    expanded_strides = np.concatenate(expanded_strides, 1)
+    outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
+    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
+
+    return outputs
+
+
+def multiclass_nms(boxes, scores, nms_thr, score_thr, class_agnostic=True):
+    """Multiclass NMS implemented in Numpy"""
+    # TODO(benjamin): check for non-class agnostic
+    # if class_agnostic:
+    nms_method = multiclass_nms_class_agnostic
+    # else:
+    #     nms_method = multiclass_nms_class_aware
+    return nms_method(boxes, scores, nms_thr, score_thr)
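What the suppression step does, as a self-contained sketch against `nms()` defined just below (the boxes and scores here are made up):

```python
# Sketch: two heavily overlapping boxes and one distinct box. The
# higher-scoring overlapping box suppresses the other at IoU > 0.45.
import numpy as np

from unstructured_inference.models.yolox import nms

boxes = np.array(
    [
        [0, 0, 100, 100],      # box A
        [5, 5, 105, 105],      # box B, overlaps A almost completely
        [200, 200, 300, 300],  # box C, far away from both
    ],
    dtype=np.float32,
)
scores = np.array([0.9, 0.8, 0.7], dtype=np.float32)

keep = nms(boxes, scores, 0.45)
print(keep)  # [0, 2]: A beats B (IoU ~0.82), C is untouched
```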
+
+
+def multiclass_nms_class_agnostic(boxes, scores, nms_thr, score_thr):
+    """Multiclass NMS implemented in Numpy. Class-agnostic version."""
+    cls_inds = scores.argmax(1)
+    cls_scores = scores[np.arange(len(cls_inds)), cls_inds]
+
+    valid_score_mask = cls_scores > score_thr
+    if valid_score_mask.sum() == 0:
+        return None
+    valid_scores = cls_scores[valid_score_mask]
+    valid_boxes = boxes[valid_score_mask]
+    valid_cls_inds = cls_inds[valid_score_mask]
+    keep = nms(valid_boxes, valid_scores, nms_thr)
+    if keep:
+        dets = np.concatenate(
+            [valid_boxes[keep], valid_scores[keep, None], valid_cls_inds[keep, None]], 1
+        )
+        return dets
+
+
+def nms(boxes, scores, nms_thr):
+    """Single class NMS implemented in Numpy."""
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
+
+    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        xx1 = np.maximum(x1[i], x1[order[1:]])
+        yy1 = np.maximum(y1[i], y1[order[1:]])
+        xx2 = np.minimum(x2[i], x2[order[1:]])
+        yy2 = np.minimum(y2[i], y2[order[1:]])
+
+        w = np.maximum(0.0, xx2 - xx1 + 1)
+        h = np.maximum(0.0, yy2 - yy1 + 1)
+        inter = w * h
+        ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+        inds = np.where(ovr <= nms_thr)[0]
+        order = order[inds + 1]
+
+    return keep
diff --git a/unstructured_inference/visualize.py b/unstructured_inference/visualize.py
new file mode 100644
index 00000000..3de62deb
--- /dev/null
+++ b/unstructured_inference/visualize.py
@@ -0,0 +1,136 @@
+# Copyright (c) Megvii Inc. All rights reserved.
+# Unstructured modified the original source code found at
+# https://github.com/Megvii-BaseDetection/YOLOX/blob/ac379df3c97d1835ebd319afad0c031c36d03f36/yolox/utils/visualize.py
+
+import cv2
+import numpy as np
+
+__all__ = ["draw_bounding_boxes"]
+
+# NOTE: in the original YoloX files the 'draw_bounding_boxes' function is named "vis"
+
+
+def draw_bounding_boxes(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
+    """
+    Draws bounding boxes over the img argument, using boxes from YoloX detections.
+    img is a numpy array from cv2.imread()
+    scores refers to the probability of each detection
+    cls_ids are the classes of each detection
+    conf is the confidence required to draw the bounding box
+    class_names is a list, where class_names[cls_ids[i]] should be the name
+    for the i-th bounding box.
+ """ + for i in range(len(boxes)): + box = boxes[i] + cls_id = int(cls_ids[i]) + score = scores[i] + if score < conf: + continue + x0 = int(box[0]) + y0 = int(box[1]) + x1 = int(box[2]) + y1 = int(box[3]) + + color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist() + text = "{}:{:.1f}%".format(class_names[cls_id], score * 100) + txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255) + font = cv2.FONT_HERSHEY_SIMPLEX + + txt_size = cv2.getTextSize(text, font, 0.4, 1)[0] + cv2.rectangle(img, (x0, y0), (x1, y1), color, 2) + + txt_bk_color = (_COLORS[cls_id] * 255 * 0.7).astype(np.uint8).tolist() + cv2.rectangle( + img, (x0, y0 + 1), (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), txt_bk_color, -1 + ) + cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1) + + return img + + +_COLORS = ( + np.array( + [ + 0.000, + 0.447, + 0.741, + 0.850, + 0.325, + 0.098, + 0.929, + 0.694, + 0.125, + 0.494, + 0.184, + 0.556, + 0.466, + 0.674, + 0.188, + 0.301, + 0.745, + 0.933, + 0.635, + 0.078, + 0.184, + 0.300, + 0.300, + 0.300, + 0.600, + 0.600, + 0.600, + 1.000, + 0.000, + 0.000, + 1.000, + 0.500, + 0.000, + 0.749, + 0.749, + 0.000, + 0.000, + 1.000, + 0.000, + 0.000, + 0.000, + 1.000, + 0.667, + 0.000, + 1.000, + 0.333, + 0.333, + 0.000, + 0.333, + 0.667, + 0.000, + 0.333, + 1.000, + 0.000, + 0.667, + 0.333, + 0.000, + 0.667, + 0.667, + 0.000, + 0.667, + 1.000, + 0.000, + 1.000, + 0.333, + 0.000, + 1.000, + 0.667, + 0.000, + 1.000, + 1.000, + 0.000, + 0.000, + 0.333, + 0.500, + 0.000, + 0.667, + 0.500, + 0.000, + 1.000, + 0.500, + 0.333, + 0.000, + 0.500, + 0.333, + 0.333, + 0.500, + 0.333, + 0.667, + 0.500, + 0.333, + 1.000, + 0.500, + 0.667, + 0.000, + 0.500, + 0.667, + 0.333, + 0.500, + 0.667, + 0.667, + 0.500, + 0.667, + 1.000, + 0.500, + 1.000, + 0.000, + 0.500, + 1.000, + 0.333, + 0.500, + 1.000, + 0.667, + 0.500, + 1.000, + 1.000, + 0.500, + 0.000, + 0.333, + 1.000, + 0.000, + 0.667, + 1.000, + 0.000, + 1.000, + 1.000, + 0.333, + 0.000, + 1.000, + 0.333, + 0.333, + 1.000, + 0.333, + 0.667, + 1.000, + 0.333, + 1.000, + 1.000, + 0.667, + 0.000, + 1.000, + 0.667, + 0.333, + 1.000, + 0.667, + 0.667, + 1.000, + 0.667, + 1.000, + 1.000, + 1.000, + 0.000, + 1.000, + 1.000, + 0.333, + 1.000, + 1.000, + 0.667, + 1.000, + 0.333, + 0.000, + 0.000, + 0.500, + 0.000, + 0.000, + 0.667, + 0.000, + 0.000, + 0.833, + 0.000, + 0.000, + 1.000, + 0.000, + 0.000, + 0.000, + 0.167, + 0.000, + 0.000, + 0.333, + 0.000, + 0.000, + 0.500, + 0.000, + 0.000, + 0.667, + 0.000, + 0.000, + 0.833, + 0.000, + 0.000, + 1.000, + 0.000, + 0.000, + 0.000, + 0.167, + 0.000, + 0.000, + 0.333, + 0.000, + 0.000, + 0.500, + 0.000, + 0.000, + 0.667, + 0.000, + 0.000, + 0.833, + 0.000, + 0.000, + 1.000, + 0.000, + 0.000, + 0.000, + 0.143, + 0.143, + 0.143, + 0.286, + 0.286, + 0.286, + 0.429, + 0.429, + 0.429, + 0.571, + 0.571, + 0.571, + 0.714, + 0.714, + 0.714, + 0.857, + 0.857, + 0.857, + 0.000, + 0.447, + 0.741, + 0.314, + 0.717, + 0.741, + 0.50, + 0.5, + 0, + ] + ) + .astype(np.float32) + .reshape(-1, 3) +)