feat: integration of YoloX for layout detection

* feat: added YoloX model for layout detection, including tests for images and PDF inference * docs: updated README with force_ocr and deleted comment inside test --------- Co-authored-by: Alan Bertl <[email protected]>
Unstructured-IO · Feb 1, 2023 · 297609f · 297609f
1 parent 5d84859
commit 297609f
Show file tree

Hide file tree

Showing 20 changed files with 909 additions and 17 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -24,13 +24,17 @@ jobs:
       uses: actions/setup-python@v4
       with:
         python-version: ${{ env.PYTHON_VERSION }}
+    - name: Install Poppler
+      run: |
+        sudo apt-get update
+        sudo apt-get -y install poppler-utils
     - name: Setup virtual environment (no cache hit)
       if: steps.virtualenv-cache.outputs.cache-hit != 'true'
       run: |
         python${{ env.PYTHON_VERSION }} -m venv .venv
         source .venv/bin/activate
         make install-ci
-
+        
   lint:
     runs-on: ubuntu-latest
     needs: setup
@@ -80,6 +84,10 @@ jobs:
         python${{ env.PYTHON_VERSION }} -m venv .venv
         source .venv/bin/activate
         make install-ci
+    - name: Install Poppler
+      run: |
+        sudo apt-get update
+        sudo apt-get -y install poppler-utils tesseract-ocr
     - name: Test
       run: |
         source .venv/bin/activate

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.2.5
+
+* Add YoloX model for images and PDFs
+
 ## 0.2.5-dev0
 
 * Add generic model interface

diff --git a/Makefile b/Makefile
@@ -29,7 +29,7 @@ install-base-pip-packages:
 
 .PHONY: install-detectron2
 install-detectron2:
-	pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"
+	pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@78d5b4f335005091fe0364ce4775d711ec93566e"
 
 .PHONY: install-test
 install-test:

diff --git a/README.md b/README.md
@@ -78,6 +78,40 @@ If you are using an Apple M1 chip, use `make run-app-dev` instead of `make start
 start the API with hot reloading. The API will run at `http://localhost:8000`.
 
 View the swagger documentation at `http://localhost:5000/docs`.
+
+## YoloX model
+
+To use the YoloX model, send requests to the following endpoints:
+```
+http://localhost:8000/layout/yolox/pdf
+http://localhost:8000/layout/yolox/image
+```
+For example:
+```
+curl -X 'POST' 'http://localhost:8000/layout/yolox/image' \
+-F 'file=@sample-docs/test-image.jpg' \
+ | jq -C | less -R
+
+curl -X 'POST' 'http://localhost:8000/layout/yolox/pdf' \
+-F 'file=@sample-docs/loremipsum.pdf' \
+ | jq -C | less -R
+```
+
+If your PDF file doesn't have embedded text, you can force the use of OCR by
+passing the form parameter `force_ocr=true`:
+```
+curl -X 'POST' 'http://localhost:8000/layout/yolox/pdf' \
+-F 'file=@sample-docs/loremipsum.pdf' \
+-F force_ocr=true \
+ | jq -C | less -R
+```
+
+Or run the model locally from Python:
+
+```
+layout = yolox_local_inference(filename, type="pdf")
+```
+
 ## Security Policy
 
 See our [security policy](https://github.com/Unstructured-IO/unstructured-inference/security/policy) for

diff --git a/requirements/base.txt b/requirements/base.txt
@@ -18,6 +18,8 @@ charset-normalizer==3.0.1
     #   requests
 click==8.1.3
     # via uvicorn
+coloredlogs==15.0.1
+    # via onnxruntime
 contourpy==1.0.7
     # via matplotlib
 cryptography==39.0.0
@@ -30,6 +32,8 @@ fastapi==0.89.1
     # via unstructured-inference (setup.py)
 filelock==3.9.0
     # via huggingface-hub
+flatbuffers==23.1.21
+    # via onnxruntime
 fonttools==4.38.0
     # via matplotlib
 h11==0.14.0
@@ -38,30 +42,39 @@ huggingface-hub==0.12.0
     # via
     #   timm
     #   unstructured-inference (setup.py)
+humanfriendly==10.0
+    # via coloredlogs
 idna==3.4
     # via
     #   anyio
     #   requests
 iopath==0.1.10
     # via layoutparser
+jsons==1.6.3
+    # via unstructured-inference (setup.py)
 kiwisolver==1.4.4
     # via matplotlib
 layoutparser[layoutmodels,tesseract]==0.3.4
     # via unstructured-inference (setup.py)
 matplotlib==3.6.3
     # via pycocotools
+mpmath==1.2.1
+    # via sympy
 numpy==1.24.1
     # via
     #   contourpy
     #   layoutparser
     #   matplotlib
+    #   onnxruntime
     #   opencv-python
     #   pandas
     #   pycocotools
     #   scipy
     #   torchvision
 omegaconf==2.3.0
     # via effdet
+onnxruntime==1.13.1
+    # via unstructured-inference (setup.py)
 opencv-python==4.6.0.66
     # via
     #   layoutparser
@@ -70,6 +83,7 @@ packaging==23.0
     # via
     #   huggingface-hub
     #   matplotlib
+    #   onnxruntime
     #   pytesseract
 pandas==1.5.3
     # via layoutparser
@@ -89,6 +103,8 @@ pillow==9.4.0
     #   torchvision
 portalocker==2.7.0
     # via iopath
+protobuf==4.21.12
+    # via onnxruntime
 pycocotools==2.0.6
     # via effdet
 pycparser==2.21
@@ -127,6 +143,8 @@ sniffio==1.3.0
     # via anyio
 starlette==0.22.0
     # via fastapi
+sympy==1.11.1
+    # via onnxruntime
 timm==0.6.12
     # via effdet
 torch==1.13.1
@@ -152,6 +170,8 @@ typing-extensions==4.4.0
     #   starlette
     #   torch
     #   torchvision
+typish==1.9.3
+    # via jsons
 urllib3==1.26.14
     # via requests
 uvicorn==0.20.0

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -25,7 +25,7 @@ attrs==22.2.0
     # via jsonschema
 backcall==0.2.0
     # via ipython
-beautifulsoup4==4.11.1
+beautifulsoup4==4.11.2
     # via nbconvert
 bleach==6.0.0
     # via nbconvert
@@ -59,7 +59,7 @@ importlib-metadata==6.0.0
     #   nbconvert
 importlib-resources==5.10.2
     # via jsonschema
-ipykernel==6.21.0
+ipykernel==6.20.2
     # via
     #   ipywidgets
     #   jupyter
@@ -111,7 +111,6 @@ jupyter-console==6.4.4
     # via jupyter
 jupyter-core==5.2.0
     # via
-    #   ipykernel
     #   jupyter-client
     #   jupyter-server
     #   nbclassic
@@ -161,6 +160,7 @@ nbformat==5.7.3
     #   notebook
 nest-asyncio==1.5.6
     # via
+    #   ipykernel
     #   nbclassic
     #   notebook
 notebook==6.5.2
@@ -182,7 +182,7 @@ pexpect==4.8.0
     # via ipython
 pickleshare==0.7.5
     # via ipython
-pip-tools==6.12.1
+pip-tools==6.12.2
     # via -r requirements/dev.in
 pkgutil-resolve-name==1.3.10
     # via jsonschema

diff --git a/requirements/test.in b/requirements/test.in
@@ -10,5 +10,7 @@ httpx
 flake8
 mypy
 pytest-cov
+pdf2image>=1.16.2
+huggingface_hub>=0.11.1
 label_studio_sdk
 vcrpy
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -27,6 +27,8 @@ coverage[toml]==7.1.0
     #   pytest-cov
 exceptiongroup==1.1.0
     # via pytest
+filelock==3.9.0
+    # via huggingface-hub
 flake8==6.0.0
     # via -r requirements/test.in
 h11==0.14.0
@@ -35,6 +37,8 @@ httpcore==0.16.3
     # via httpx
 httpx==0.23.3
     # via -r requirements/test.in
+huggingface-hub==0.12.0
+    # via -r requirements/test.in
 idna==3.4
     # via
     #   anyio
@@ -58,9 +62,15 @@ mypy-extensions==0.4.3
     #   black
     #   mypy
 packaging==23.0
-    # via pytest
+    # via
+    #   huggingface-hub
+    #   pytest
 pathspec==0.11.0
     # via black
+pdf2image==1.16.2
+    # via -r requirements/test.in
+pillow==9.4.0
+    # via pdf2image
 platformdirs==2.6.2
     # via black
 pluggy==1.0.0
@@ -76,9 +86,13 @@ pytest==7.2.1
 pytest-cov==4.0.0
     # via -r requirements/test.in
 pyyaml==6.0
-    # via vcrpy
+    # via
+    #   huggingface-hub
+    #   vcrpy
 requests==2.28.2
-    # via label-studio-sdk
+    # via
+    #   huggingface-hub
+    #   label-studio-sdk
 rfc3986[idna2008]==1.5.0
     # via httpx
 six==1.16.0
@@ -94,9 +108,12 @@ tomli==2.0.1
     #   coverage
     #   mypy
     #   pytest
+tqdm==4.64.1
+    # via huggingface-hub
 typing-extensions==4.4.0
     # via
     #   black
+    #   huggingface-hub
     #   mypy
     #   pydantic
 urllib3==1.26.14

diff --git a/sample-docs/empty-document.pdf b/sample-docs/empty-document.pdf
diff --git a/sample-docs/non-embedded.pdf b/sample-docs/non-embedded.pdf
diff --git a/sample-docs/test-image.jpg b/sample-docs/test-image.jpg
diff --git a/setup.py b/setup.py
@@ -57,6 +57,8 @@
         # on RHEL7. We can remove this pin once the following issue from 12/2022 is resolved
         # ref: https://github.com/opencv/opencv-python/issues/772
         "opencv-python==4.6.0.66",
+        "onnxruntime",
+        "jsons"
     ],
     extras_require={},
 )
diff --git a/test_unstructured_inference/models/test_tesseract.py b/test_unstructured_inference/models/test_tesseract.py
@@ -11,6 +11,7 @@ def __init__(self, languages):
 
 def test_load_agent(monkeypatch):
     monkeypatch.setattr(tesseract, "TesseractAgent", MockTesseractAgent)
+    monkeypatch.setattr(tesseract, "ocr_agent", None)
 
     with patch.object(tesseract, "is_pytesseract_available", return_value=True):
         tesseract.load_agent()