Skip to content

Commit

Permalink
fix: ocr when no elements are found in block (#68)
Browse files Browse the repository at this point in the history
Stopgap fix for a bug that causes the parsing procedure to ignore PDF elements that are not contained within the bounds of an inferred/specified layout element.
  • Loading branch information
qued authored Mar 10, 2023
1 parent 237d69d commit 4814a72
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.2.11

* Fixed some cases where image elements were not being OCR'd

## 0.2.10

* Removed control characters from tesseract output
Expand Down
2 changes: 1 addition & 1 deletion scripts/version-sync.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ done
# Version appearing earliest in CHANGELOGFILE will be used as ground truth.
CHANGELOGFILE="CHANGELOG.md"
VERSIONFILE="unstructured_inference/__version__.py"
RE_SEMVER_FULL="(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(-((0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
# Pull out semver appearing earliest in CHANGELOGFILE.
LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$CHANGELOGFILE")

Expand Down
12 changes: 12 additions & 0 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,9 @@ class MockLayout:
def __init__(self, *elements):
    """Wrap the given elements; calling with no arguments yields an empty mock layout."""
    # ``*elements`` arrives as a tuple, so ``self.elements`` is always a tuple.
    self.elements = elements

def __len__(self):
    """Return the number of wrapped elements."""
    # With no ``__bool__`` defined, Python falls back to ``__len__`` for truth testing,
    # so a mock holding no elements is falsy in checks such as ``if not layout``.
    return len(self.elements)

def sort(self, key, inplace):
    """Return the wrapped elements unchanged.

    ``key`` and ``inplace`` are accepted only so the call signature matches what callers
    pass; this mock ignores both and performs no reordering.
    """
    return self.elements

Expand All @@ -239,6 +242,9 @@ def __iter__(self):
def get_texts(self):
    """Return a list with the ``text`` attribute of every wrapped element, in order."""
    return [element.text for element in self.elements]

def filter_by(self, *args, **kwargs):
    """Return an empty ``MockLayout`` regardless of the arguments given.

    All positional and keyword arguments are accepted and ignored, so every filter
    request made against this mock comes back with no elements.
    """
    return MockLayout()


@pytest.mark.parametrize(
"block_text, layout_texts, expected_text",
Expand Down Expand Up @@ -334,3 +340,9 @@ def test_invalid_ocr_strategy_raises(mock_image):
)
def test_remove_control_characters(text, expected):
assert layout.remove_control_characters(text) == expected


def test_interpret_called_when_filter_empty(mock_image):
    """Check that ``interpret_text_block`` runs exactly once for an empty layout.

    ``MockLayout()`` is built with no elements, so ``aggregate_by_block`` receives a layout
    that has nothing to offer for the block.
    """
    with patch("unstructured_inference.inference.layout.interpret_text_block") as mock_interpret:
        layout.aggregate_by_block(MockTextBlock(), mock_image, MockLayout())
        # Assert while the patch is still active; on exit the real function is restored.
        mock_interpret.assert_called_once()
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.10" # pragma: no cover
__version__ = "0.2.11" # pragma: no cover
7 changes: 7 additions & 0 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,13 @@ def aggregate_by_block(
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""
filtered_blocks = layout.filter_by(text_block, center=True)
# NOTE(alan): For now, if none of the elements discovered by layoutparser are in the block,
# we interpret the whole block instead. This still doesn't handle every edge case: for example,
# when some text elements lie within the block but an image element that merely overlaps the
# block also has text inside it, the text in that image would likely be ignored.
if not filtered_blocks:
text = interpret_text_block(text_block, image, ocr_strategy)
return text
for little_block in filtered_blocks:
little_block.text = interpret_text_block(little_block, image, ocr_strategy)
text = " ".join([x for x in filtered_blocks.get_texts() if x])
Expand Down

0 comments on commit 4814a72

Please sign in to comment.