fix: ocr when no elements are found in block (#68)

Stopgap fix for a bug that causes the parsing procedure to ignore PDF elements that are not contained within the bounds of an inferred or specified layout element.
Unstructured-IO · Mar 10, 2023 · 4814a72
1 parent 237d69d
commit 4814a72
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.2.11
+
+* Fixed some cases where image elements were not being OCR'd
+
 ## 0.2.10
 
 * Removed control characters from tesseract output

diff --git a/scripts/version-sync.sh b/scripts/version-sync.sh
@@ -16,7 +16,7 @@ done
 # Version appearing earliest in CHANGELOGFILE will be used as ground truth.
 CHANGELOGFILE="CHANGELOG.md"
 VERSIONFILE="unstructured_inference/__version__.py"
-RE_SEMVER_FULL="(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(-((0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
+RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
 # Pull out semver appearing earliest in CHANGELOGFILE.
 LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$CHANGELOGFILE")
 

diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -230,6 +230,9 @@ class MockLayout:
     def __init__(self, *elements):
         self.elements = elements
 
+    def __len__(self):
+        return len(self.elements)
+
     def sort(self, key, inplace):
         return self.elements
 
@@ -239,6 +242,9 @@ def __iter__(self):
     def get_texts(self):
         return [el.text for el in self.elements]
 
+    def filter_by(self, *args, **kwargs):
+        return MockLayout()
+
 
 @pytest.mark.parametrize(
     "block_text, layout_texts, expected_text",
@@ -334,3 +340,9 @@ def test_invalid_ocr_strategy_raises(mock_image):
 )
 def test_remove_control_characters(text, expected):
     assert layout.remove_control_characters(text) == expected
+
+
+def test_interpret_called_when_filter_empty(mock_image):
+    with patch("unstructured_inference.inference.layout.interpret_text_block"):
+        layout.aggregate_by_block(MockTextBlock(), mock_image, MockLayout())
+        layout.interpret_text_block.assert_called_once()
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.10"  # pragma: no cover
+__version__ = "0.2.11"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -298,6 +298,13 @@ def aggregate_by_block(
     """Extracts the text aggregated from the elements of the given layout that lie within the given
     block."""
     filtered_blocks = layout.filter_by(text_block, center=True)
+    # NOTE(alan): For now, if none of the elements discovered by layoutparser are in the block
+    # we can try interpreting the whole block. This still doesn't handle edge cases, like when there
+    # are some text elements within the block, but there are image elements overlapping the block
+    # with text lying within the block. In this case the text in the image would likely be ignored.
+    if not filtered_blocks:
+        text = interpret_text_block(text_block, image, ocr_strategy)
+        return text
     for little_block in filtered_blocks:
         little_block.text = interpret_text_block(little_block, image, ocr_strategy)
     text = " ".join([x for x in filtered_blocks.get_texts() if x])
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = "0.2.10"  # pragma: no cover
		+__version__ = "0.2.11"  # pragma: no cover