Skip to content

Commit

Permalink
chore: bump unstructured to 0.10.21 (#280)
Browse files Browse the repository at this point in the history
Add a workaround to the dockerfile for an import error when fetching the
models (see Unstructured-IO/unstructured#1717)
  • Loading branch information
awalker4 authored Oct 12, 2023
1 parent 5b8a57f commit 528849e
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 87 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ jobs:
uses: ludeeus/action-shellcheck@master

test:
runs-on: ubuntu-latest
runs-on: ubuntu-latest-m
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -97,7 +97,7 @@ jobs:
# TODO - figure out best practice for caching docker images
# (Using the virtualenv to get pytest)
test_dockerfile:
runs-on: ubuntu-latest
runs-on: ubuntu-latest-m
needs: [setup, lint]
steps:
- uses: actions/checkout@v4
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## 0.0.52-dev1
## 0.0.52

* Bump unstructured to 0.10.21
* Fix an unhandled error when a non-pdf file is sent with content-type pdf
* Fix an unhandled error when a non-docx file is sent with content-type docx

Expand Down
7 changes: 6 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,14 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} \
USER ${NB_USER}

FROM python-deps as model-deps

# Note(Austin) - Unstructured 0.10.20 has some broken imports in ingest.
# The Chunker and Embedder imports are not needed here, so strip them from
# the ingest pipeline package for now.
RUN python3.10 -c "import nltk; nltk.download('punkt')" && \
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
python3.10 -c "from unstructured.ingest.doc_processor.generalized import initialize; initialize()"
sed -i '/Chunker/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \
sed -i '/Embedder/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \
python3.10 -c "from unstructured.ingest.pipeline.initialize import initialize; initialize()"

FROM model-deps as code
COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md
Expand Down
8 changes: 4 additions & 4 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,12 +237,12 @@ def pipeline_api(
m_encoding=[],
m_hi_res_model_name=[],
m_include_page_breaks=[],
m_ocr_languages=[],
m_ocr_languages=None,
m_pdf_infer_table_structure=[],
m_skip_infer_table_types=[],
m_strategy=[],
m_xml_keep_tags=[],
languages=["eng"],
languages=None,
m_chunking_strategy=[],
m_multipage_sections=[],
m_combine_under_n_chars=[],
Expand Down Expand Up @@ -608,12 +608,12 @@ def pipeline_1(
encoding: List[str] = Form(default=[]),
hi_res_model_name: List[str] = Form(default=[]),
include_page_breaks: List[str] = Form(default=[]),
ocr_languages: List[str] = Form(default=[]),
ocr_languages: List[str] = Form(default=None),
pdf_infer_table_structure: List[str] = Form(default=[]),
skip_infer_table_types: List[str] = Form(default=[]),
strategy: List[str] = Form(default=[]),
xml_keep_tags: List[str] = Form(default=[]),
languages: List[str] = ["eng"],
languages: List[str] = Form(default=None),
chunking_strategy: List[str] = Form(default=[]),
multipage_sections: List[str] = Form(default=[]),
combine_under_n_chars: List[str] = Form(default=[]),
Expand Down
35 changes: 20 additions & 15 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ anyio==3.7.1
# fastapi
# starlette
backoff==2.2.1
# via -r requirements/base.in
# via
# -r requirements/base.in
# unstructured
beautifulsoup4==4.12.2
# via unstructured
certifi==2023.7.22
Expand All @@ -35,7 +37,7 @@ contourpy==1.1.1
# via matplotlib
cryptography==41.0.4
# via pdfminer-six
cycler==0.12.0
cycler==0.12.1
# via matplotlib
dataclasses-json==0.6.1
# via unstructured
Expand All @@ -60,15 +62,15 @@ filetype==1.2.0
# via unstructured
flatbuffers==23.5.26
# via onnxruntime
fonttools==4.43.0
fonttools==4.43.1
# via matplotlib
fsspec==2023.9.2
# via
# huggingface-hub
# torch
h11==0.14.0
# via uvicorn
huggingface-hub==0.16.4
huggingface-hub==0.17.3
# via
# timm
# tokenizers
Expand Down Expand Up @@ -98,7 +100,7 @@ lxml==4.9.3
# python-docx
# python-pptx
# unstructured
markdown==3.4.4
markdown==3.5
# via unstructured
markupsafe==2.1.3
# via jinja2
Expand Down Expand Up @@ -136,7 +138,7 @@ omegaconf==2.3.0
# via effdet
onnx==1.14.1
# via unstructured-inference
onnxruntime==1.16.0
onnxruntime==1.15.1
# via unstructured-inference
opencv-python==4.8.1.78
# via
Expand Down Expand Up @@ -199,17 +201,17 @@ pypandoc==1.11
# via unstructured
pyparsing==3.1.1
# via matplotlib
pypdf==3.16.2
pypdf==3.16.4
# via -r requirements/base.in
pypdfium2==4.20.0
pypdfium2==4.21.0
# via pdfplumber
pytesseract==0.3.10
# via layoutparser
python-dateutil==2.8.2
# via
# matplotlib
# pandas
python-docx==0.8.11
python-docx==1.0.0
# via unstructured
python-iso639==2023.6.15
# via unstructured
Expand All @@ -228,8 +230,10 @@ pyyaml==6.0.1
# omegaconf
# timm
# transformers
rapidfuzz==3.3.1
# via unstructured-inference
rapidfuzz==3.4.0
# via
# unstructured
# unstructured-inference
ratelimit==2.2.1
# via -r requirements/base.in
regex==2023.10.3
Expand Down Expand Up @@ -269,7 +273,7 @@ tabulate==0.9.0
# via unstructured
timm==0.9.7
# via effdet
tokenizers==0.14.0
tokenizers==0.14.1
# via transformers
torch==2.1.0
# via
Expand Down Expand Up @@ -298,16 +302,17 @@ typing-extensions==4.8.0
# onnx
# pydantic
# pydantic-core
# python-docx
# torch
# typing-inspect
# uvicorn
typing-inspect==0.9.0
# via dataclasses-json
tzdata==2023.3
# via pandas
unstructured[local-inference]==0.10.19
unstructured[local-inference]==0.10.21
# via -r requirements/base.in
unstructured-inference==0.6.6
unstructured-inference==0.7.2
# via unstructured
unstructured-pytesseract==0.3.12
# via unstructured
Expand All @@ -317,5 +322,5 @@ uvicorn==0.23.2
# via -r requirements/base.in
xlrd==2.0.1
# via unstructured
xlsxwriter==3.1.6
xlsxwriter==3.1.7
# via python-pptx
44 changes: 24 additions & 20 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,17 @@ babel==2.13.0
backcall==0.2.0
# via ipython
backoff==2.2.1
# via -r requirements/base.txt
# via
# -r requirements/base.txt
# unstructured
beautifulsoup4==4.12.2
# via
# -r requirements/base.txt
# nbconvert
# unstructured
black==23.9.1
# via -r requirements/test.in
bleach==6.0.0
bleach==6.1.0
# via nbconvert
certifi==2023.7.22
# via
Expand Down Expand Up @@ -97,7 +99,7 @@ cryptography==41.0.4
# via
# -r requirements/base.txt
# pdfminer-six
cycler==0.12.0
cycler==0.12.1
# via
# -r requirements/base.txt
# matplotlib
Expand Down Expand Up @@ -162,7 +164,7 @@ flatbuffers==23.5.26
# via
# -r requirements/base.txt
# onnxruntime
fonttools==4.43.0
fonttools==4.43.1
# via
# -r requirements/base.txt
# matplotlib
Expand All @@ -184,7 +186,7 @@ httpcore==0.18.0
# via httpx
httpx==0.25.0
# via -r requirements/test.in
huggingface-hub==0.16.4
huggingface-hub==0.17.3
# via
# -r requirements/base.txt
# timm
Expand Down Expand Up @@ -253,7 +255,7 @@ jsonschema-specifications==2023.7.1
# via jsonschema
jupyter==1.0.0
# via -r requirements/test.in
jupyter-client==8.3.1
jupyter-client==8.4.0
# via
# ipykernel
# jupyter-console
Expand All @@ -262,7 +264,7 @@ jupyter-client==8.3.1
# qtconsole
jupyter-console==6.6.3
# via jupyter
jupyter-core==5.3.2
jupyter-core==5.4.0
# via
# ipykernel
# jupyter-client
Expand Down Expand Up @@ -315,7 +317,7 @@ lxml==4.9.3
# python-docx
# python-pptx
# unstructured
markdown==3.4.4
markdown==3.5
# via
# -r requirements/base.txt
# unstructured
Expand Down Expand Up @@ -348,7 +350,7 @@ msg-parser==1.2.0
# via
# -r requirements/base.txt
# unstructured
mypy==1.5.1
mypy==1.6.0
# via -r requirements/test.in
mypy-extensions==1.0.0
# via
Expand Down Expand Up @@ -412,7 +414,7 @@ onnx==1.14.1
# via
# -r requirements/base.txt
# unstructured-inference
onnxruntime==1.16.0
onnxruntime==1.15.1
# via
# -r requirements/base.txt
# unstructured-inference
Expand Down Expand Up @@ -551,9 +553,9 @@ pyparsing==3.1.1
# via
# -r requirements/base.txt
# matplotlib
pypdf==3.16.2
pypdf==3.16.4
# via -r requirements/base.txt
pypdfium2==4.20.0
pypdfium2==4.21.0
# via
# -r requirements/base.txt
# pdfplumber
Expand All @@ -576,7 +578,7 @@ python-dateutil==2.8.2
# jupyter-client
# matplotlib
# pandas
python-docx==0.8.11
python-docx==1.0.0
# via
# -r requirements/base.txt
# unstructured
Expand Down Expand Up @@ -623,9 +625,10 @@ qtconsole==5.4.4
# via jupyter
qtpy==2.4.0
# via qtconsole
rapidfuzz==3.3.1
rapidfuzz==3.4.0
# via
# -r requirements/base.txt
# unstructured
# unstructured-inference
ratelimit==2.2.1
# via -r requirements/base.txt
Expand Down Expand Up @@ -655,7 +658,7 @@ rfc3986-validator==0.1.1
# via
# jsonschema
# jupyter-events
rpds-py==0.10.4
rpds-py==0.10.5
# via
# jsonschema
# referencing
Expand Down Expand Up @@ -716,7 +719,7 @@ timm==0.9.7
# effdet
tinycss2==1.2.1
# via nbconvert
tokenizers==0.14.0
tokenizers==0.14.1
# via
# -r requirements/base.txt
# transformers
Expand Down Expand Up @@ -789,6 +792,7 @@ typing-extensions==4.8.0
# mypy
# onnx
# pydantic
# python-docx
# torch
# typing-inspect
# uvicorn
Expand All @@ -800,9 +804,9 @@ tzdata==2023.3
# via
# -r requirements/base.txt
# pandas
unstructured[local-inference]==0.10.19
unstructured[local-inference]==0.10.21
# via -r requirements/base.txt
unstructured-inference==0.6.6
unstructured-inference==0.7.2
# via
# -r requirements/base.txt
# unstructured
Expand All @@ -828,7 +832,7 @@ webencodings==0.5.1
# via
# bleach
# tinycss2
websocket-client==1.6.3
websocket-client==1.6.4
# via jupyter-server
wheel==0.41.2
# via astunparse
Expand All @@ -838,7 +842,7 @@ xlrd==2.0.1
# via
# -r requirements/base.txt
# unstructured
xlsxwriter==3.1.6
xlsxwriter==3.1.7
# via
# -r requirements/base.txt
# python-pptx
Expand Down
Loading

0 comments on commit 528849e

Please sign in to comment.