Skip to content

Commit

Permalink
chore: bump unstructured to 0.10.23 (#285)
Browse files Browse the repository at this point in the history
Note: I added a workaround for [this
bug](https://github.com/Unstructured-IO/unstructured/issues/1754).
  • Loading branch information
awalker4 authored Oct 16, 2023
1 parent 92908f2 commit c0b945e
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 34 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }}
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
Expand All @@ -42,7 +42,7 @@ jobs:
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }}
- name: Lint
run: |
source .venv/bin/activate
Expand All @@ -65,7 +65,7 @@ jobs:
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }}
- name: Run core tests
run: |
source .venv/bin/activate
Expand Down Expand Up @@ -106,7 +106,7 @@ jobs:
with:
path: |
.venv
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }}
key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }}
- name: Test Dockerfile
run: |
source .venv/bin/activate
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## 0.0.53-dev0
## 0.0.53

* Bump unstructured to 0.10.23
* Simplify the error message for BadZipFile errors

## 0.0.52
Expand Down
4 changes: 0 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,8 @@ USER ${NB_USER}

FROM python-deps as model-deps

# Note(Austin) - Unstructured 0.10.20 has some broken imports in ingest
# Not relevant here - remove the imports for now
RUN python3.10 -c "import nltk; nltk.download('punkt')" && \
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
sed -i '/Chunker/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \
sed -i '/Embedder/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \
python3.10 -c "from unstructured.ingest.pipeline.initialize import initialize; initialize()"

FROM model-deps as code
Expand Down
11 changes: 9 additions & 2 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def partition_pdf_splits(
# If it's small enough, just process locally
# (Some kwargs need to be renamed for local partition)
if len(pdf_pages) <= pages_per_pdf:
if "hi_res_model_name" in partition_kwargs:
if partition_kwargs.get("hi_res_model_name"):
partition_kwargs["model_name"] = partition_kwargs.pop("hi_res_model_name")

return partition(
Expand Down Expand Up @@ -406,6 +406,13 @@ def pipeline_api(
)
)

# TODO(austin) - The latest unstructured release won't accept model_name=None.
# Only pass model_name when it is set, until the fix is released.
# https://github.com/Unstructured-IO/unstructured/issues/1754
kwargs = {}
if hi_res_model_name:
kwargs["model_name"] = hi_res_model_name

# Be careful of naming differences in api params vs partition params!
# These kwargs are going back into the api, not into partition
# If there's a difference, remap the param in partition_pdf_splits
Expand Down Expand Up @@ -440,7 +447,6 @@ def pipeline_api(
# partition_kwargs
encoding=encoding,
include_page_breaks=include_page_breaks,
model_name=hi_res_model_name,
ocr_languages=ocr_languages,
pdf_infer_table_structure=pdf_infer_table_structure,
skip_infer_table_types=skip_infer_table_types,
Expand All @@ -451,6 +457,7 @@ def pipeline_api(
multipage_sections=multipage_sections,
combine_under_n_chars=combine_under_n_chars,
new_after_n_chars=new_after_n_chars,
**kwargs,
)
except ValueError as e:
if "Invalid file" in e.args[0]:
Expand Down
20 changes: 11 additions & 9 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,12 @@ msg-parser==1.2.0
mypy-extensions==1.0.0
# via typing-inspect
networkx==3.1
# via torch
# via
# torch
# unstructured
nltk==3.8.1
# via unstructured
numpy==1.26.0
numpy==1.26.1
# via
# contourpy
# layoutparser
Expand All @@ -137,7 +139,7 @@ olefile==0.46
omegaconf==2.3.0
# via effdet
onnx==1.14.1
# via unstructured-inference
# via unstructured
onnxruntime==1.15.1
# via unstructured-inference
opencv-python==4.8.1.78
Expand Down Expand Up @@ -169,7 +171,7 @@ pdfminer-six==20221105
# unstructured
pdfplumber==0.10.2
# via layoutparser
pillow==10.0.1
pillow==10.1.0
# via
# layoutparser
# matplotlib
Expand All @@ -185,7 +187,7 @@ protobuf==4.24.4
# via
# onnx
# onnxruntime
psutil==5.9.5
psutil==5.9.6
# via -r requirements/base.in
pycocotools==2.0.7
# via effdet
Expand All @@ -211,7 +213,7 @@ python-dateutil==2.8.2
# via
# matplotlib
# pandas
python-docx==1.0.0
python-docx==1.0.1
# via unstructured
python-iso639==2023.6.15
# via unstructured
Expand Down Expand Up @@ -310,9 +312,9 @@ typing-inspect==0.9.0
# via dataclasses-json
tzdata==2023.3
# via pandas
unstructured[local-inference]==0.10.21
unstructured[local-inference]==0.10.23
# via -r requirements/base.in
unstructured-inference==0.7.2
unstructured-inference==0.7.5
# via unstructured
unstructured-pytesseract==0.3.12
# via unstructured
Expand All @@ -322,5 +324,5 @@ uvicorn==0.23.2
# via -r requirements/base.in
xlrd==2.0.1
# via unstructured
xlsxwriter==3.1.7
xlsxwriter==3.1.8
# via python-pptx
29 changes: 15 additions & 14 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,11 +275,11 @@ jupyter-core==5.4.0
# nbconvert
# nbformat
# qtconsole
jupyter-events==0.7.0
jupyter-events==0.8.0
# via jupyter-server
jupyter-lsp==2.2.0
# via jupyterlab
jupyter-server==2.7.3
jupyter-server==2.8.0
# via
# jupyter-lsp
# jupyterlab
Expand All @@ -288,7 +288,7 @@ jupyter-server==2.7.3
# notebook-shim
jupyter-server-terminals==0.4.4
# via jupyter-server
jupyterlab==4.0.6
jupyterlab==4.0.7
# via notebook
jupyterlab-pygments==0.2.2
# via nbconvert
Expand Down Expand Up @@ -377,17 +377,18 @@ networkx==3.1
# via
# -r requirements/base.txt
# torch
# unstructured
nltk==3.8.1
# via
# -r requirements/base.txt
# unstructured
notebook==7.0.4
notebook==7.0.5
# via jupyter
notebook-shim==0.2.3
# via
# jupyterlab
# notebook
numpy==1.26.0
numpy==1.26.1
# via
# -r requirements/base.txt
# contourpy
Expand All @@ -413,7 +414,7 @@ omegaconf==2.3.0
onnx==1.14.1
# via
# -r requirements/base.txt
# unstructured-inference
# unstructured
onnxruntime==1.15.1
# via
# -r requirements/base.txt
Expand Down Expand Up @@ -479,7 +480,7 @@ pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pillow==10.0.1
pillow==10.1.0
# via
# -r requirements/base.txt
# layoutparser
Expand Down Expand Up @@ -511,7 +512,7 @@ protobuf==4.24.4
# -r requirements/base.txt
# onnx
# onnxruntime
psutil==5.9.5
psutil==5.9.6
# via
# -r requirements/base.txt
# ipykernel
Expand All @@ -525,7 +526,7 @@ pycocotools==2.0.7
# via
# -r requirements/base.txt
# effdet
pycodestyle==2.11.0
pycodestyle==2.11.1
# via flake8
pycparser==2.21
# via
Expand Down Expand Up @@ -578,7 +579,7 @@ python-dateutil==2.8.2
# jupyter-client
# matplotlib
# pandas
python-docx==1.0.0
python-docx==1.0.1
# via
# -r requirements/base.txt
# unstructured
Expand Down Expand Up @@ -658,7 +659,7 @@ rfc3986-validator==0.1.1
# via
# jsonschema
# jupyter-events
rpds-py==0.10.5
rpds-py==0.10.6
# via
# jsonschema
# referencing
Expand Down Expand Up @@ -804,9 +805,9 @@ tzdata==2023.3
# via
# -r requirements/base.txt
# pandas
unstructured[local-inference]==0.10.21
unstructured[local-inference]==0.10.23
# via -r requirements/base.txt
unstructured-inference==0.7.2
unstructured-inference==0.7.5
# via
# -r requirements/base.txt
# unstructured
Expand Down Expand Up @@ -842,7 +843,7 @@ xlrd==2.0.1
# via
# -r requirements/base.txt
# unstructured
xlsxwriter==3.1.7
xlsxwriter==3.1.8
# via
# -r requirements/base.txt
# python-pptx
Expand Down

0 comments on commit c0b945e

Please sign in to comment.