fix: convert to pixels (#74)

Fixed failure to convert points to pixels when loading embedded elements in a PDF. Also added the paddleocr dependency for x86_64 machines, and corrected an incompatibility between `LayoutElement` and its `unstructured` equivalent.
Unstructured-IO · Mar 29, 2023 · db173d0 · db173d0
1 parent 43887e6
commit db173d0
Show file tree

Hide file tree

Showing 8 changed files with 59 additions and 27 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.3.0
+
+* Fix for text block detection
+* Add paddleocr dependency to setup for x86_64 machines
+
 ## 0.2.14
 
 * Suppressed processing progress bars

diff --git a/requirements/base.txt b/requirements/base.txt
@@ -22,22 +22,22 @@ coloredlogs==15.0.1
     # via onnxruntime
 contourpy==1.0.7
     # via matplotlib
-cryptography==39.0.2
+cryptography==40.0.1
     # via pdfminer-six
 cycler==0.11.0
     # via matplotlib
 effdet==0.3.0
     # via layoutparser
 fastapi==0.95.0
     # via unstructured-inference (setup.py)
-filelock==3.10.0
+filelock==3.10.7
     # via
     #   huggingface-hub
     #   torch
     #   transformers
 flatbuffers==23.3.3
     # via onnxruntime
-fonttools==4.39.2
+fonttools==4.39.3
     # via matplotlib
 h11==0.14.0
     # via uvicorn
@@ -86,7 +86,7 @@ omegaconf==2.3.0
     # via effdet
 onnxruntime==1.14.1
     # via unstructured-inference (setup.py)
-opencv-python==4.7.0.72
+opencv-python==4.6.0.66
     # via
     #   layoutparser
     #   unstructured-inference (setup.py)
@@ -121,7 +121,7 @@ pycocotools==2.0.6
     # via effdet
 pycparser==2.21
     # via cffi
-pydantic==1.10.6
+pydantic==1.10.7
     # via fastapi
 pyparsing==3.0.9
     # via matplotlib
@@ -133,7 +133,7 @@ python-dateutil==2.8.2
     #   pandas
 python-multipart==0.0.6
     # via unstructured-inference (setup.py)
-pytz==2022.7.1
+pytz==2023.3
     # via pandas
 pyyaml==6.0
     # via
@@ -142,7 +142,7 @@ pyyaml==6.0
     #   omegaconf
     #   timm
     #   transformers
-regex==2022.10.31
+regex==2023.3.23
     # via transformers
 requests==2.28.2
     # via
@@ -161,7 +161,7 @@ sympy==1.11.1
     # via
     #   onnxruntime
     #   torch
-timm==0.6.12
+timm==0.6.13
     # via effdet
 tokenizers==0.13.2
     # via transformers
@@ -181,7 +181,7 @@ tqdm==4.65.0
     #   huggingface-hub
     #   iopath
     #   transformers
-transformers==4.27.2
+transformers==4.27.4
     # via unstructured-inference (setup.py)
 typing-extensions==4.5.0
     # via

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -6,6 +6,10 @@
 #
 anyio==3.6.2
     # via jupyter-server
+appnope==0.1.3
+    # via
+    #   ipykernel
+    #   ipython
 argon2-cffi==21.3.0
     # via
     #   jupyter-server
@@ -31,7 +35,7 @@ cffi==1.15.1
     # via argon2-cffi-bindings
 click==8.1.3
     # via pip-tools
-comm==0.1.2
+comm==0.1.3
     # via ipykernel
 debugpy==1.6.6
     # via ipykernel
@@ -57,14 +61,15 @@ importlib-resources==5.12.0
     # via jsonschema
 ipykernel==6.22.0
     # via
+    #   ipywidgets
     #   jupyter
     #   jupyter-console
     #   nbclassic
     #   notebook
     #   qtconsole
 ipython==8.11.0
     # via
-    #   -r dev.in
+    #   -r requirements/dev.in
     #   ipykernel
     #   ipywidgets
     #   jupyter-console
@@ -73,7 +78,7 @@ ipython-genutils==0.2.0
     #   nbclassic
     #   notebook
     #   qtconsole
-ipywidgets==8.0.5
+ipywidgets==8.0.6
     # via jupyter
 isoduration==20.11.0
     # via jsonschema
@@ -126,7 +131,7 @@ jupyter-server-terminals==0.4.4
     # via jupyter-server
 jupyterlab-pygments==0.2.2
     # via nbconvert
-jupyterlab-widgets==3.0.6
+jupyterlab-widgets==3.0.7
     # via ipywidgets
 markupsafe==2.1.2
     # via
@@ -184,7 +189,7 @@ pip-tools==6.12.3
     # via -r requirements/dev.in
 pkgutil-resolve-name==1.3.10
     # via jsonschema
-platformdirs==3.1.1
+platformdirs==3.2.0
     # via jupyter-core
 prometheus-client==0.16.0
     # via
@@ -234,7 +239,7 @@ pyzmq==25.0.2
     #   qtconsole
 qtconsole==5.4.1
     # via jupyter
-qtpy==2.3.0
+qtpy==2.3.1
     # via qtconsole
 rfc3339-validator==0.1.4
     # via
@@ -269,6 +274,10 @@ terminado==0.17.1
     #   notebook
 tinycss2==1.2.1
     # via nbconvert
+tomli==2.0.1
+    # via
+    #   build
+    #   pyproject-hooks
 tornado==6.2
     # via
     #   ipykernel
@@ -299,7 +308,7 @@ uri-template==1.2.0
     # via jsonschema
 wcwidth==0.2.6
     # via prompt-toolkit
-webcolors==1.12
+webcolors==1.13
     # via jsonschema
 webencodings==0.5.1
     # via
@@ -309,7 +318,7 @@ websocket-client==1.5.1
     # via jupyter-server
 wheel==0.40.0
     # via pip-tools
-widgetsnbextension==4.0.6
+widgetsnbextension==4.0.7
     # via ipywidgets
 zipp==3.15.0
     # via

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -10,7 +10,7 @@ appdirs==1.4.4
     # via label-studio-tools
 attrs==22.2.0
     # via pytest
-black==23.1.0
+black==23.3.0
     # via -r requirements/test.in
 certifi==2022.12.7
     # via
@@ -29,7 +29,7 @@ coverage[toml]==7.2.2
     #   pytest-cov
 exceptiongroup==1.1.1
     # via pytest
-filelock==3.10.0
+filelock==3.10.7
     # via huggingface-hub
 flake8==6.0.0
     # via
@@ -82,13 +82,13 @@ pdf2image==1.16.3
     # via -r requirements/test.in
 pillow==9.4.0
     # via pdf2image
-platformdirs==3.1.1
+platformdirs==3.2.0
     # via black
 pluggy==1.0.0
     # via pytest
 pycodestyle==2.10.0
     # via flake8
-pydantic==1.10.6
+pydantic==1.10.7
     # via label-studio-sdk
 pydocstyle==6.3.0
     # via flake8-docstrings

diff --git a/setup.py b/setup.py
@@ -18,7 +18,6 @@
 limitations under the License.
 """
 from setuptools import setup, find_packages
-from platform import machine
 
 from unstructured_inference.__version__ import __version__
 
@@ -60,6 +59,6 @@
         "opencv-python==4.6.0.66",
         "onnxruntime",
         "transformers",
+        'unstructured.PaddleOCR ; platform_machine=="x86_64"',
     ],
-    extras_require={"paddle-ocr": "unstructured.PaddleOCR"},
 )
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.14"  # pragma: no cover
+__version__ = "0.3.0"  # pragma: no cover
diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
@@ -58,6 +58,11 @@ def is_in(self, other: Rectangle, error_margin: Optional[int] = None):
             ]
         )
 
+    @property
+    def coordinates(self):
+        """Gets coordinates of the rectangle"""
+        return ((self.x1, self.y1), (self.x1, self.y2), (self.x2, self.y2), (self.x2, self.y1))
+
 
 @dataclass
 class TextRegion(Rectangle):
@@ -77,7 +82,12 @@ class LayoutElement(TextRegion):
 
     def to_dict(self) -> dict:
         """Converts the class instance to dictionary form."""
-        return self.__dict__
+        out_dict = {
+            "coordinates": self.coordinates,
+            "text": self.text,
+            "type": self.type,
+        }
+        return out_dict
 
     @classmethod
     def from_region(cls, region: Rectangle):

diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -363,12 +363,21 @@ def load_pdf(
         )
         word_objs = [
             TextRegion(
-                x1=word["x0"], y1=word["top"], x2=word["x1"], y2=word["bottom"], text=word["text"]
+                x1=word["x0"] * dpi / 72,
+                y1=word["top"] * dpi / 72,
+                x2=word["x1"] * dpi / 72,
+                y2=word["bottom"] * dpi / 72,
+                text=word["text"],
             )
             for word in plumber_words
         ]
         image_objs = [
-            ImageTextRegion(x1=image["x0"], y1=image["y0"], x2=image["x1"], y2=image["y1"])
+            ImageTextRegion(
+                x1=image["x0"] * dpi / 72,
+                y1=image["y0"] * dpi / 72,
+                x2=image["x1"] * dpi / 72,
+                y2=image["y1"] * dpi / 72,
+            )
             for image in page.images
         ]
         layout = word_objs + image_objs
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.2.14" # pragma: no cover
		__version__ = "0.3.0" # pragma: no cover