diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 63393a0c..ad2f06bf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: with: path: | .venv - key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }} + key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} - name: Set up Python ${{ env.PYTHON_VERSION }} uses: actions/setup-python@v4 with: @@ -42,7 +42,7 @@ jobs: with: path: | .venv - key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }} + key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} - name: Lint run: | source .venv/bin/activate @@ -65,7 +65,7 @@ jobs: with: path: | .venv - key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }} + key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} - name: Run core tests run: | source .venv/bin/activate @@ -106,7 +106,7 @@ jobs: with: path: | .venv - key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }} + key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} - name: Test Dockerfile run: | source .venv/bin/activate diff --git a/CHANGELOG.md b/CHANGELOG.md index 9163b41a..b35c1f1e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.0.53-dev0 +## 0.0.53 +* Bump unstructured to 0.10.23 * Simplify the error message for BadZipFile errors ## 0.0.52 diff --git a/Dockerfile b/Dockerfile index 0d683de1..e1a2904a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -35,12 +35,8 @@ USER ${NB_USER} FROM python-deps as model-deps -# Note(Austin) - Unstructured 0.10.20 has some broken imports in ingest -# Not relevant here - remove the imports for now RUN python3.10 -c "import nltk; nltk.download('punkt')" && \ python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \ - sed -i '/Chunker/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \ - sed -i '/Embedder/d' ~/.local/lib/python3.10/site-packages/unstructured/ingest/pipeline/__init__.py && \ python3.10 -c "from unstructured.ingest.pipeline.initialize import initialize; initialize()" FROM model-deps as code diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 2b374291..4eeea62e 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -197,7 +197,7 @@ def partition_pdf_splits( # If it's small enough, just process locally # (Some kwargs need to be renamed for local partition) if len(pdf_pages) <= pages_per_pdf: - if "hi_res_model_name" in partition_kwargs: + if partition_kwargs.get("hi_res_model_name"): partition_kwargs["model_name"] = partition_kwargs.pop("hi_res_model_name") return partition( @@ -406,6 +406,13 @@ def pipeline_api( ) ) + # TODO(austin) - Latest unstructured won't accept model_name=None + # Just pass if it's set until the fix is released + # https://github.com/Unstructured-IO/unstructured/issues/1754 + kwargs = {} + if hi_res_model_name: + kwargs["model_name"] = hi_res_model_name + # Be careful of naming differences in api params vs partition params! # These kwargs are going back into the api, not into partition # If there's a difference, remap the param in partition_pdf_splits @@ -440,7 +447,6 @@ def pipeline_api( # partition_kwargs encoding=encoding, include_page_breaks=include_page_breaks, - model_name=hi_res_model_name, ocr_languages=ocr_languages, pdf_infer_table_structure=pdf_infer_table_structure, skip_infer_table_types=skip_infer_table_types, @@ -451,6 +457,7 @@ def pipeline_api( multipage_sections=multipage_sections, combine_under_n_chars=combine_under_n_chars, new_after_n_chars=new_after_n_chars, + **kwargs, ) except ValueError as e: if "Invalid file" in e.args[0]: diff --git a/requirements/base.txt b/requirements/base.txt index 0a0669ec..42735239 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -115,10 +115,12 @@ msg-parser==1.2.0 mypy-extensions==1.0.0 # via typing-inspect networkx==3.1 - # via torch + # via + # torch + # unstructured nltk==3.8.1 # via unstructured -numpy==1.26.0 +numpy==1.26.1 # via # contourpy # layoutparser @@ -137,7 +139,7 @@ olefile==0.46 omegaconf==2.3.0 # via effdet onnx==1.14.1 - # via unstructured-inference + # via unstructured onnxruntime==1.15.1 # via unstructured-inference opencv-python==4.8.1.78 @@ -169,7 +171,7 @@ pdfminer-six==20221105 # unstructured pdfplumber==0.10.2 # via layoutparser -pillow==10.0.1 +pillow==10.1.0 # via # layoutparser # matplotlib @@ -185,7 +187,7 @@ protobuf==4.24.4 # via # onnx # onnxruntime -psutil==5.9.5 +psutil==5.9.6 # via -r requirements/base.in pycocotools==2.0.7 # via effdet @@ -211,7 +213,7 @@ python-dateutil==2.8.2 # via # matplotlib # pandas -python-docx==1.0.0 +python-docx==1.0.1 # via unstructured python-iso639==2023.6.15 # via unstructured @@ -310,9 +312,9 @@ typing-inspect==0.9.0 # via dataclasses-json tzdata==2023.3 # via pandas -unstructured[local-inference]==0.10.21 +unstructured[local-inference]==0.10.23 # via -r requirements/base.in -unstructured-inference==0.7.2 +unstructured-inference==0.7.5 # via unstructured unstructured-pytesseract==0.3.12 # via unstructured @@ -322,5 +324,5 @@ uvicorn==0.23.2 # via -r requirements/base.in xlrd==2.0.1 # via unstructured -xlsxwriter==3.1.7 +xlsxwriter==3.1.8 # via python-pptx diff --git a/requirements/test.txt b/requirements/test.txt index 51da9ff1..77fa5c8f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -275,11 +275,11 @@ jupyter-core==5.4.0 # nbconvert # nbformat # qtconsole -jupyter-events==0.7.0 +jupyter-events==0.8.0 # via jupyter-server jupyter-lsp==2.2.0 # via jupyterlab -jupyter-server==2.7.3 +jupyter-server==2.8.0 # via # jupyter-lsp # jupyterlab @@ -288,7 +288,7 @@ jupyter-server==2.7.3 # notebook-shim jupyter-server-terminals==0.4.4 # via jupyter-server -jupyterlab==4.0.6 +jupyterlab==4.0.7 # via notebook jupyterlab-pygments==0.2.2 # via nbconvert @@ -377,17 +377,18 @@ networkx==3.1 # via # -r requirements/base.txt # torch + # unstructured nltk==3.8.1 # via # -r requirements/base.txt # unstructured -notebook==7.0.4 +notebook==7.0.5 # via jupyter notebook-shim==0.2.3 # via # jupyterlab # notebook -numpy==1.26.0 +numpy==1.26.1 # via # -r requirements/base.txt # contourpy @@ -413,7 +414,7 @@ omegaconf==2.3.0 onnx==1.14.1 # via # -r requirements/base.txt - # unstructured-inference + # unstructured onnxruntime==1.15.1 # via # -r requirements/base.txt @@ -479,7 +480,7 @@ pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pillow==10.0.1 +pillow==10.1.0 # via # -r requirements/base.txt # layoutparser @@ -511,7 +512,7 @@ protobuf==4.24.4 # -r requirements/base.txt # onnx # onnxruntime -psutil==5.9.5 +psutil==5.9.6 # via # -r requirements/base.txt # ipykernel @@ -525,7 +526,7 @@ pycocotools==2.0.7 # via # -r requirements/base.txt # effdet -pycodestyle==2.11.0 +pycodestyle==2.11.1 # via flake8 pycparser==2.21 # via @@ -578,7 +579,7 @@ python-dateutil==2.8.2 # jupyter-client # matplotlib # pandas -python-docx==1.0.0 +python-docx==1.0.1 # via # -r requirements/base.txt # unstructured @@ -658,7 +659,7 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events -rpds-py==0.10.5 +rpds-py==0.10.6 # via # jsonschema # referencing @@ -804,9 +805,9 @@ tzdata==2023.3 # via # -r requirements/base.txt # pandas -unstructured[local-inference]==0.10.21 +unstructured[local-inference]==0.10.23 # via -r requirements/base.txt -unstructured-inference==0.7.2 +unstructured-inference==0.7.5 # via # -r requirements/base.txt # unstructured @@ -842,7 +843,7 @@ xlrd==2.0.1 # via # -r requirements/base.txt # unstructured -xlsxwriter==3.1.7 +xlsxwriter==3.1.8 # via # -r requirements/base.txt # python-pptx