BREAKING CHANGE: remove legacy detectron2 model; remove layoutparse…

…r extras (#350) ### Summary First step in resolving Unstructured-IO/unstructured#3051. Per [this comment](Unstructured-IO/unstructured#3051 (comment)), we were having trouble running `unstructured` in the Python 3.12 `wolfi-base` container due to issues related to `pycocotools`, which is only used for the legacy `detectron2` model from `layoutparser`. Since we've replaced this with `detectron2onnx`, this PR removes the `layoutparser` extra dependencies that caused issues with Python 3.12. The `layoutparser` base dependency is still required because we use layout objects from that library. It's likely we could remove these in a future iteration. Temporarily disabled the ingest tests, because they seem to have been broken for the past six months. The last commit for which they passed was [this one](0f0c2be). Opened #352 to re-enable them. ### Testing If CI passes we should be good to go.
Unstructured-IO · May 22, 2024 · 7b2125b · 7b2125b
1 parent 81549a7
commit 7b2125b
Show file tree

Hide file tree

Showing 12 changed files with 121 additions and 288 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -104,48 +104,50 @@ jobs:
         CI=true make test
         make check-coverage
 
-  test_ingest:
-    strategy:
-      matrix:
-        python-version: ["3.9","3.10"]
-    runs-on: ubuntu-latest
-    env:
-      NLTK_DATA: ${{ github.workspace }}/nltk_data
-    needs: lint
-    steps:
-    - name: Checkout unstructured repo for integration testing
-      uses: actions/checkout@v4
-      with:
-        repository: 'Unstructured-IO/unstructured'
-    - name: Checkout this repo
-      uses: actions/checkout@v4
-      with:
-        path: inference
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Test
-      env:
-        GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
-        SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
-        DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
-      run: |
-        python${{ matrix.python-version }} -m venv .venv
-        source .venv/bin/activate
-        [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
-        make install-ci
-        pip install -e inference/
-        sudo apt-get update
-        sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
-        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get install -y tesseract-ocr
-        sudo apt-get install -y tesseract-ocr-kor
-        sudo apt-get install -y diffstat
-        tesseract --version
-        make install-all-ingest
-        # only run ingest tests that check expected output diffs.
-        bash inference/scripts/test-unstructured-ingest-helper.sh
+  # NOTE(robinson) - disabling ingest tests for now, as of 5/22/2024 they seem to have been
+  # broken for the past six months
+  # test_ingest:
+  #   strategy:
+  #     matrix:
+  #       python-version: ["3.9","3.10"]
+  #   runs-on: ubuntu-latest
+  #   env:
+  #     NLTK_DATA: ${{ github.workspace }}/nltk_data
+  #   needs: lint
+  #   steps:
+  #   - name: Checkout unstructured repo for integration testing
+  #     uses: actions/checkout@v4
+  #     with:
+  #       repository: 'Unstructured-IO/unstructured'
+  #   - name: Checkout this repo
+  #     uses: actions/checkout@v4
+  #     with:
+  #       path: inference
+  #   - name: Set up Python ${{ matrix.python-version }}
+  #     uses: actions/setup-python@v4
+  #     with:
+  #       python-version: ${{ matrix.python-version }}
+  #   - name: Test
+  #     env:
+  #       GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
+  #       SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+  #       DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
+  #     run: |
+  #       python${{ matrix.python-version }} -m venv .venv
+  #       source .venv/bin/activate
+  #       [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA"
+  #       make install-ci
+  #       pip install -e inference/
+  #       sudo apt-get update
+  #       sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
+  #       sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
+  #       sudo apt-get install -y tesseract-ocr
+  #       sudo apt-get install -y tesseract-ocr-kor
+  #       sudo apt-get install -y diffstat
+  #       tesseract --version
+  #       make install-all-ingest
+  #       # only run ingest tests that check expected output diffs.
+  #       bash inference/scripts/test-unstructured-ingest-helper.sh
 
   changelog:
     runs-on: ubuntu-latest

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,17 +1,22 @@
+## 0.7.33
+
+* BREAKING CHANGE: removes legacy detectron2 model
+* deps: remove layoutparser optional dependencies
+
 ## 0.7.32
 
-* refactor: remove all code related to filling inferred elements text from embedded text (pdfminer). 
+* refactor: remove all code related to filling inferred elements text from embedded text (pdfminer).
 * bug: set the Chipper max_length variable
 
 ## 0.7.31
 
-* refactor: remove all `cid` related code that was originally added to filter out invalid `pdfminer` text 
+* refactor: remove all `cid` related code that was originally added to filter out invalid `pdfminer` text
 * enhancement: Wrapped hf_hub_download with a function that checks for local file before checking HF
 
 ## 0.7.30
 
-* fix: table transformer doesn't return multiple cells with same coordinates 
-* 
+* fix: table transformer doesn't return multiple cells with same coordinates
+*
 ## 0.7.29
 
 * fix: table transformer predictions are now removed if confidence is below threshold
@@ -458,4 +463,4 @@ we have the mapping from standard language code to paddle language code.
 
 ## 0.2.0
 
-* Initial release of unstructured-inference
+* Initial release of unstructured-inference
diff --git a/requirements/base.in b/requirements/base.in
@@ -1,10 +1,13 @@
 -c constraints.in
-layoutparser[layoutmodels,tesseract]
+layoutparser
 python-multipart
 huggingface-hub
 opencv-python!=4.7.0.68
 onnx
 onnxruntime>=1.17.0
+matplotlib
+torch
+timm
 # NOTE(alan): Pinned because this is when the most recent module we import appeared
 transformers>=4.25.1
 rapidfuzz
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -4,8 +4,6 @@
 #
 #    pip-compile requirements/base.in
 #
-antlr4-python3-runtime==4.9.3
-    # via omegaconf
 certifi==2024.2.2
     # via requests
 cffi==1.16.0
@@ -18,13 +16,11 @@ coloredlogs==15.0.1
     # via onnxruntime
 contourpy==1.2.1
     # via matplotlib
-cryptography==42.0.5
+cryptography==42.0.7
     # via pdfminer-six
 cycler==0.12.1
     # via matplotlib
-effdet==0.4.1
-    # via layoutparser
-filelock==3.13.4
+filelock==3.14.0
     # via
     #   huggingface-hub
     #   torch
@@ -33,11 +29,11 @@ flatbuffers==24.3.25
     # via onnxruntime
 fonttools==4.51.0
     # via matplotlib
-fsspec==2024.3.1
+fsspec==2024.5.0
     # via
     #   huggingface-hub
     #   torch
-huggingface-hub==0.22.2
+huggingface-hub==0.23.1
     # via
     #   -r requirements/base.in
     #   timm
@@ -51,16 +47,16 @@ importlib-resources==6.4.0
     # via matplotlib
 iopath==0.1.10
     # via layoutparser
-jinja2==3.1.3
+jinja2==3.1.4
     # via torch
 kiwisolver==1.4.5
     # via matplotlib
-layoutparser[layoutmodels,tesseract]==0.3.4
+layoutparser==0.3.4
     # via -r requirements/base.in
 markupsafe==2.1.5
     # via jinja2
-matplotlib==3.8.4
-    # via pycocotools
+matplotlib==3.9.0
+    # via -r requirements/base.in
 mpmath==1.3.0
     # via sympy
 networkx==3.2.1
@@ -74,15 +70,12 @@ numpy==1.26.4
     #   onnxruntime
     #   opencv-python
     #   pandas
-    #   pycocotools
     #   scipy
     #   torchvision
     #   transformers
-omegaconf==2.3.0
-    # via effdet
 onnx==1.16.0
     # via -r requirements/base.in
-onnxruntime==1.17.3
+onnxruntime==1.18.0
     # via -r requirements/base.in
 opencv-python==4.9.0.80
     # via
@@ -93,7 +86,6 @@ packaging==24.0
     #   huggingface-hub
     #   matplotlib
     #   onnxruntime
-    #   pytesseract
     #   transformers
 pandas==2.2.2
     # via layoutparser
@@ -109,24 +101,19 @@ pillow==10.3.0
     #   matplotlib
     #   pdf2image
     #   pdfplumber
-    #   pytesseract
     #   torchvision
 portalocker==2.8.2
     # via iopath
 protobuf==5.26.1
     # via
     #   onnx
     #   onnxruntime
-pycocotools==2.0.7
-    # via effdet
 pycparser==2.22
     # via cffi
 pyparsing==3.1.2
     # via matplotlib
-pypdfium2==4.29.0
+pypdfium2==4.30.0
     # via pdfplumber
-pytesseract==0.3.10
-    # via layoutparser
 python-dateutil==2.9.0.post0
     # via
     #   matplotlib
@@ -139,14 +126,13 @@ pyyaml==6.0.1
     # via
     #   huggingface-hub
     #   layoutparser
-    #   omegaconf
     #   timm
     #   transformers
-rapidfuzz==3.8.1
+rapidfuzz==3.9.1
     # via -r requirements/base.in
-regex==2024.4.16
+regex==2024.5.15
     # via transformers
-requests==2.31.0
+requests==2.32.2
     # via
     #   huggingface-hub
     #   transformers
@@ -162,27 +148,23 @@ sympy==1.12
     # via
     #   onnxruntime
     #   torch
-timm==0.9.16
-    # via effdet
+timm==1.0.3
+    # via -r requirements/base.in
 tokenizers==0.19.1
     # via transformers
-torch==2.2.2
+torch==2.3.0
     # via
-    #   effdet
-    #   layoutparser
+    #   -r requirements/base.in
     #   timm
     #   torchvision
-torchvision==0.17.2
-    # via
-    #   effdet
-    #   layoutparser
-    #   timm
-tqdm==4.66.2
+torchvision==0.18.0
+    # via timm
+tqdm==4.66.4
     # via
     #   huggingface-hub
     #   iopath
     #   transformers
-transformers==4.40.0
+transformers==4.41.0
     # via -r requirements/base.in
 typing-extensions==4.11.0
     # via
@@ -193,5 +175,5 @@ tzdata==2024.1
     # via pandas
 urllib3==2.2.1
     # via requests
-zipp==3.18.1
+zipp==3.18.2
     # via importlib-resources